From b369c0862b284a002187a641c22977d4a0b2029d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BCdiger=20Diedrich?= Date: Tue, 2 Aug 2022 10:28:16 +0200 Subject: [PATCH] rewrite old parsing logic; parse postdate --- lib/something_erlang/awful_api/awful_api.ex | 168 -------------------- lib/something_erlang/awful_api/thread.ex | 46 +++--- 2 files changed, 25 insertions(+), 189 deletions(-) diff --git a/lib/something_erlang/awful_api/awful_api.ex b/lib/something_erlang/awful_api/awful_api.ex index 876a7f8..0dffb9c 100644 --- a/lib/something_erlang/awful_api/awful_api.ex +++ b/lib/something_erlang/awful_api/awful_api.ex @@ -1,144 +1,8 @@ defmodule SomethingErlang.AwfulApi do require Logger - alias SomethingErlang.AwfulApi.Client alias SomethingErlang.AwfulApi.Thread - def parse_thread_userinfo(thread) do - {_, userinfos} = - Floki.find(thread, "td.userinfo dl.userinfo") - |> Floki.traverse_and_update([], fn - {"dt", [{"class", _class} | _rest], children}, acc -> - {nil, [{:name, Floki.text(children)} | acc]} - - {"dd", [{"class", "registered"} | _rest], children}, acc -> - {nil, [{:regdate, Floki.text(children)} | acc]} - - {"dd", [{"class", "title"} | _rest], children}, acc -> - {nil, [{:title, Floki.raw_html(children)} | acc]} - - other, acc -> - {other, acc} - end) - - Enum.chunk_every(userinfos, 3) - |> Enum.map(&Map.new/1) - |> Enum.reverse() - end - - def parse_thread_postdate(thread) do - {[], postdates} = - Floki.find(thread, "td.postdate") - |> Floki.traverse_and_update([], fn - {"td", [{"class", "postdate"}], children}, acc -> - {nil, [Floki.text(children) | acc]} - - other, acc -> - {other, acc} - end) - - Enum.reverse(postdates) - end - - def parse_thread_postbody(thread) do - {[], postbodies} = - Floki.find(thread, "td.postbody") - |> Floki.traverse_and_update(fn - {"img", attrs, []} -> transform(:img, attrs) - {"a", attrs, children} -> transform(:a, attrs, children) - {:comment, _} -> nil - other -> other - end) - # TODO: use Floki find or smth? - |> Floki.traverse_and_update([], fn - {"td", [{"class", "postbody"}], children}, acc -> - {nil, [Floki.raw_html(children) | acc]} - - other, acc -> - {other, acc} - end) - - Enum.reverse(postbodies) - end - - defp transform(elem, attr, children \\ []) - - defp transform(:img, attrs, _children) do - {"class", class} = List.keyfind(attrs, "class", 0, {"class", ""}) - if class == "sa-smilie" do - {"img", attrs, []} - else - t_attrs = List.keyreplace(attrs, "class", 0, {"class", "img-responsive"}) - {"img", [{"loading", "lazy"} | t_attrs], []} - end - end - - defp transform(:a, attrs, children) do - {"href", href} = List.keyfind(attrs, "href", 0, {"href", ""}) - cond do - # skip internal links - String.starts_with?(href, "/") -> - {"a", [{"href", href}], children} - - # mp4 - String.ends_with?(href, ".mp4") -> - transform_link(:mp4, href) - - # gifv - String.ends_with?(href, ".gifv") -> - transform_link(:gifv, href) - - # youtube - String.starts_with?(href, "https://www.youtube.com/watch") -> - transform_link(:ytlong, href) - - String.starts_with?(href, "https://youtu.be/") -> - transform_link(:ytshort, href) - - true -> - Logger.debug "no transform for #{href}" - {"a", [{"href", href}], children} - end - end - - defp transform_link(:mp4, href), - do: {"div", [{"class", "responsive-embed"}], - [{"video", [{"class", "img-responsive"}, {"controls", ""}], - [{"source", [{"src", href}, {"type", "video/mp4"}], []}] - }] - } - - defp transform_link(:gifv, href), - do: {"div", [{"class", "responsive-embed"}], - [{"video", [{"class", "img-responsive"}, {"controls", ""}], - [{"source", [{"src", String.replace(href, ".gifv", ".webm")}, - {"type", "video/webm"}], []}, - {"source", [{"src", String.replace(href, ".gifv", ".mp4")}, - {"type", "video/mp4"}], []}] - }] - } - - defp transform_link(:ytlong, href) do - String.replace(href, "/watch?v=", "/embed/") - |> youtube_iframe() - end - - defp transform_link(:ytshort, href) do - String.replace(href, "youtu.be/", "www.youtube.com/embed/") - |> youtube_iframe() - end - - defp youtube_iframe(src), - do: {"div", [{"class", "responsive-embed"}], - [{"iframe", - [ - {"class", "youtube-player"}, - {"loading", "lazy"}, - {"allow", "fullscreen"}, - {"src", src} - ], []} - ]} - @doc """ Returns a list of all posts on page of a thread. @@ -153,36 +17,4 @@ defmodule SomethingErlang.AwfulApi do def parsed_thread(id, page, user) do Thread.compile(id, page, user) end - - def parsed_thread(id, page, user, :deprecated) do - doc = Client.thread_doc(id, page, user) - html = Floki.parse_document!(doc) - thread = Floki.find(html, "#thread") |> Floki.filter_out("table.post.ignored") - |> tap(&IO.inspect(Floki.find(&1, "table.post") |> Enum.take(2))) - - title = Floki.find(html, "title") |> Floki.text() - title = title |> String.replace(" - The Something Awful Forums", "") - - page_count = - case Floki.find(html, "#content .pages.top option:last-of-type") |> Floki.text() do - "" -> 1 - s -> String.to_integer(s) - end - - posts = - Enum.zip([ - parse_thread_userinfo(thread), - parse_thread_postdate(thread), - parse_thread_postbody(thread) - ]) - |> Enum.map(fn {ui, pd, pb} -> - %{:userinfo => ui, :postdate => pd, :postbody => pb} - end) - - %{id: id, - title: title, - page: page, - page_count: page_count, - posts: posts} - end end diff --git a/lib/something_erlang/awful_api/thread.ex b/lib/something_erlang/awful_api/thread.ex index 397fb3c..76cd1d7 100644 --- a/lib/something_erlang/awful_api/thread.ex +++ b/lib/something_erlang/awful_api/thread.ex @@ -33,15 +33,13 @@ defmodule SomethingErlang.AwfulApi.Thread do posts: posts} end - def userinfo(post) do - - + defp userinfo(post) do user = Floki.find(post, "dl.userinfo") - user |> IO.inspect() - - name = user |> Floki.find("dt") |> Floki.text() |> IO.inspect() - regdate = user |> Floki.find("dd.registered") |> Floki.text() |> IO.inspect() - title = user |> Floki.find_and_update("dd.title", fn {"dd", attrs} -> {"div", attrs} end) |> Floki.raw_html() + name = user |> Floki.find("dt") |> Floki.text() + regdate = user |> Floki.find("dd.registered") |> Floki.text() + title = + user |> Floki.find("dd.title") |> List.first() + |> Floki.children() |> Floki.raw_html() %{ name: name, @@ -50,28 +48,34 @@ defmodule SomethingErlang.AwfulApi.Thread do } end - def postdate(post) do - _date = + defp postdate(post) do + date = Floki.find(post, "td.postdate") - |> Floki.find("td.postdate") |> Floki.children() |> Floki.text() + |> Floki.find("td.postdate") |> Floki.text() + + [month_text, day, year, hours, minutes] = date + |> String.split(~r{[\s,:]}, trim: true) + |> Enum.drop(1) + + month = 1 + Enum.find_index(["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], + fn m -> m == month_text end) + NaiveDateTime.new!(year |> String.to_integer(), month, day |> String.to_integer(), + hours |> String.to_integer(), minutes |> String.to_integer(), 0) end - def postbody(post) do - body = Floki.find(post, "td.postbody") + defp postbody(post) do + body = + Floki.find(post, "td.postbody") + |> List.first() + |> Floki.filter_out(:comment) Floki.traverse_and_update(body, fn {"img", attrs, []} -> transform(:img, attrs) {"a", attrs, children} -> transform(:a, attrs, children) - {:comment, _} -> nil other -> other end) - |> Floki.traverse_and_update([], fn - {"td", [{"class", "postbody"}], children}, acc -> - {nil, [Floki.raw_html(children) | acc]} - - other, acc -> - {other, acc} - end) + |> Floki.children() + |> Floki.raw_html() end defp transform(elem, attr, children \\ [])