From 757c1fe063b42c3f71c3c72c746e0e03bff1e853 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BCdiger=20Diedrich?= Date: Mon, 3 Jun 2024 19:33:10 +0200 Subject: [PATCH 1/2] start rewrite --- lib/something_erlang/awful_api/awful_api.ex | 6 +++--- lib/something_erlang/awful_api/thread.ex | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/something_erlang/awful_api/awful_api.ex b/lib/something_erlang/awful_api/awful_api.ex index e70cb9d..a11d903 100644 --- a/lib/something_erlang/awful_api/awful_api.ex +++ b/lib/something_erlang/awful_api/awful_api.ex @@ -1,8 +1,7 @@ defmodule SomethingErlang.AwfulApi do require Logger - alias SomethingErlang.AwfulApi.Thread - alias SomethingErlang.AwfulApi.Bookmarks + alias SomethingErlang.AwfulApi.{Client, Thread, Bookmarks} @doc """ Returns a list of all posts on page of a thread. @@ -16,7 +15,8 @@ defmodule SomethingErlang.AwfulApi do 12 """ def parsed_thread(id, page, user) do - Thread.compile(id, page, user) + Client.thread_doc(id, page, user) + |> Thread.compile() end def bookmarks(user) do diff --git a/lib/something_erlang/awful_api/thread.ex b/lib/something_erlang/awful_api/thread.ex index 8d00102..81f23e5 100644 --- a/lib/something_erlang/awful_api/thread.ex +++ b/lib/something_erlang/awful_api/thread.ex @@ -3,8 +3,7 @@ defmodule SomethingErlang.AwfulApi.Thread do alias SomethingErlang.AwfulApi.Client - def compile(id, page, user) do - doc = Client.thread_doc(id, page, user) + def compile(doc) do html = Floki.parse_document!(doc) thread = Floki.find(html, "#thread") |> Floki.filter_out("table.post.ignored") From 4a25eae6c6db1584c23141e8c7a97be1f3faae98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=BCdiger=20Diedrich?= Date: Tue, 4 Jun 2024 00:06:23 +0200 Subject: [PATCH 2/2] meeseeks in --- lib/something_erlang/awful_api/thread.ex | 91 ++++++++++++++++-------- mix.exs | 4 +- mix.lock | 3 + 3 files changed, 66 insertions(+), 32 deletions(-) diff --git a/lib/something_erlang/awful_api/thread.ex b/lib/something_erlang/awful_api/thread.ex index 81f23e5..1ca3d41 100644 --- a/lib/something_erlang/awful_api/thread.ex +++ b/lib/something_erlang/awful_api/thread.ex @@ -1,23 +1,43 @@ defmodule SomethingErlang.AwfulApi.Thread do + import Meeseeks.CSS + require Logger - alias SomethingErlang.AwfulApi.Client + def compile(html) do + title = + Meeseeks.one(html, css("title")) + |> Meeseeks.text() + |> String.replace(" - The Something Awful Forums", "") - def compile(doc) do - html = Floki.parse_document!(doc) - thread = Floki.find(html, "#thread") |> Floki.filter_out("table.post.ignored") + thread = + Meeseeks.one(html, css("#thread")) - title = Floki.find(html, "title") |> Floki.text() - title = title |> String.replace(" - The Something Awful Forums", "") + # Floki.find(html, "#thread") |> Floki.filter_out("table.post.ignored") + + thread_id = + Meeseeks.attr(thread, "class") + |> String.split(":") + |> List.last() + |> String.to_integer() + + page = + Meeseeks.one(html, css("#content .pages.top option[selected]")) + |> Meeseeks.text() + |> case do + "" -> 1 + s -> String.to_integer(s) + end page_count = - case Floki.find(html, "#content .pages.top option:last-of-type") |> Floki.text() do + Meeseeks.one(html, css("#content .pages.top option:last-of-type")) + |> Meeseeks.text() + |> case do "" -> 1 s -> String.to_integer(s) end posts = - for post <- Floki.find(thread, "table.post") do + for post <- Meeseeks.all(thread, css("table.post")) do %{ userinfo: post |> userinfo(), postdate: post |> postdate(), @@ -25,14 +45,18 @@ defmodule SomethingErlang.AwfulApi.Thread do } end - %{id: id, title: title, page: page, page_count: page_count, posts: posts} + %{id: thread_id, title: title, page: page, page_count: page_count, posts: posts} end defp userinfo(post) do - user = Floki.find(post, "dl.userinfo") - name = user |> Floki.find("dt") |> Floki.text() - regdate = user |> Floki.find("dd.registered") |> Floki.text() - title = user |> Floki.find("dd.title") |> List.first() |> Floki.children() |> Floki.raw_html() + user = Meeseeks.one(post, css("dl.userinfo")) + name = user |> Meeseeks.one(css("dt")) |> Meeseeks.text() + regdate = user |> Meeseeks.one(css("dd.registered")) |> Meeseeks.text() + + title = + user + |> Meeseeks.one(css("dd.title > *")) + |> Meeseeks.html() %{ name: name, @@ -42,12 +66,16 @@ defmodule SomethingErlang.AwfulApi.Thread do end defp postdate(post) do - date = Floki.find(post, "td.postdate") |> Floki.find("td.postdate") |> Floki.text() + date = + post + |> Meeseeks.one(css("td.postdate")) + |> Meeseeks.text() [month_text, day, year, hours, minutes] = date |> String.split(~r{[\s,:]}, trim: true) - |> Enum.drop(1) + |> Enum.drop(2) + |> dbg() month = 1 + @@ -67,23 +95,20 @@ defmodule SomethingErlang.AwfulApi.Thread do end defp postbody(post) do - body = - Floki.find(post, "td.postbody") - |> List.first() - |> Floki.filter_out(:comment) + {_, _, body} = + post + |> Meeseeks.one(css("td.postbody")) + |> Meeseeks.tree() - Floki.traverse_and_update(body, fn - {"img", attrs, []} -> transform(:img, attrs) - {"a", attrs, children} -> transform(:a, attrs, children) - other -> other - end) - |> Floki.children() - |> Floki.raw_html() + body + |> Enum.map(&transform/1) + |> Enum.reject(fn x -> x == "" end) + |> then(&{"div", [], &1}) + |> Meeseeks.parse(:tuple_tree) + |> Meeseeks.html() end - defp transform(elem, attr, children \\ []) - - defp transform(:img, attrs, _children) do + defp transform({"img", attrs, _children}) do {"class", class} = List.keyfind(attrs, "class", 0, {"class", ""}) if class == "sa-smilie" do @@ -94,7 +119,7 @@ defmodule SomethingErlang.AwfulApi.Thread do end end - defp transform(:a, attrs, children) do + defp transform({"a", attrs, children}) do {"href", href} = List.keyfind(attrs, "href", 0, {"href", ""}) cond do @@ -123,6 +148,12 @@ defmodule SomethingErlang.AwfulApi.Thread do end end + defp transform({:comment, _}), do: "" + defp transform({tag, attrs, children}), do: {tag, attrs, children} + + defp transform(text) when is_binary(text), + do: String.trim(text) + defp transform_link(:mp4, href), do: {"div", [{"class", "responsive-embed"}], diff --git a/mix.exs b/mix.exs index 15a0f66..79994aa 100644 --- a/mix.exs +++ b/mix.exs @@ -39,8 +39,7 @@ defmodule SomethingErlang.MixProject do {:phoenix_html, "~> 4.0"}, {:phoenix_live_reload, "~> 1.2", only: :dev}, {:phoenix_live_view, "~> 0.20.2"}, - # {:floki, ">= 0.30.0", only: :test}, - {:floki, ">= 0.30.0"}, + {:floki, ">= 0.30.0", only: :test}, {:phoenix_live_dashboard, "~> 0.8.3"}, {:esbuild, "~> 0.8", runtime: Mix.env() == :dev}, {:tailwind, "~> 0.2", runtime: Mix.env() == :dev}, @@ -59,6 +58,7 @@ defmodule SomethingErlang.MixProject do {:jason, "~> 1.2"}, {:dns_cluster, "~> 0.1.1"}, {:bandit, "~> 1.2"}, + {:meeseeks, "~> 0.17.0"}, {:req, "~> 0.5.0"} ] end diff --git a/mix.lock b/mix.lock index a1c37fe..294e5a8 100644 --- a/mix.lock +++ b/mix.lock @@ -15,6 +15,8 @@ "heroicons": {:git, "https://github.com/tailwindlabs/heroicons.git", "88ab3a0d790e6a47404cba02800a6b25d2afae50", [tag: "v2.1.1", sparse: "optimized"]}, "hpax": {:hex, :hpax, "0.1.2", "09a75600d9d8bbd064cdd741f21fc06fc1f4cf3d0fcc335e5aa19be1a7235c84", [:mix], [], "hexpm", "2c87843d5a23f5f16748ebe77969880e29809580efdaccd615cd3bed628a8c13"}, "jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"}, + "meeseeks": {:hex, :meeseeks, "0.17.0", "8a41ceccd2365476c2b779292e7649fb25f0a9735030905941f1244d2095c8a6", [:mix], [{:meeseeks_html5ever, "~> 0.14.3", [hex: :meeseeks_html5ever, repo: "hexpm", optional: false]}], "hexpm", "13efaf321a1517dea046cb48ff9baa9dc0604d9afd82c57501bc01dc45a5e309"}, + "meeseeks_html5ever": {:hex, :meeseeks_html5ever, "0.14.3", "7827c6ce393d9f99dd0220c356fd66ee7101718037ec6f7f18d4bcba84ef1798", [:mix], [{:rustler, ">= 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.6.1", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "6b69573b97120fcc6e97045178ad085fd3ee10a5b49c1e9ebb8a28bd4a9c538b"}, "mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"}, "mint": {:hex, :mint, "1.6.0", "88a4f91cd690508a04ff1c3e28952f322528934be541844d54e0ceb765f01d5e", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "3c5ae85d90a5aca0a49c0d8b67360bbe407f3b54f1030a111047ff988e8fefaa"}, "nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"}, @@ -31,6 +33,7 @@ "plug_crypto": {:hex, :plug_crypto, "2.1.0", "f44309c2b06d249c27c8d3f65cfe08158ade08418cf540fd4f72d4d6863abb7b", [:mix], [], "hexpm", "131216a4b030b8f8ce0f26038bc4421ae60e4bb95c5cf5395e1421437824c4fa"}, "postgrex": {:hex, :postgrex, "0.18.0", "f34664101eaca11ff24481ed4c378492fed2ff416cd9b06c399e90f321867d7e", [:mix], [{:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "a042989ba1bc1cca7383ebb9e461398e3f89f868c92ce6671feb7ef132a252d1"}, "req": {:hex, :req, "0.5.0", "6d8a77c25cfc03e06a439fb12ffb51beade53e3fe0e2c5e362899a18b50298b3", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 1.6 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "dda04878c1396eebbfdec6db6f3d4ca609e5c8846b7ee88cc56eb9891406f7a3"}, + "rustler_precompiled": {:hex, :rustler_precompiled, "0.6.3", "f838d94bc35e1844973ee7266127b156fdc962e9e8b7ff666c8fb4fed7964d23", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "e18ecca3669a7454b3a2be75ae6c3ef01d550bc9a8cf5fbddcfff843b881d7c6"}, "swoosh": {:hex, :swoosh, "1.16.9", "20c6a32ea49136a4c19f538e27739bb5070558c0fa76b8a95f4d5d5ca7d319a1", [:mix], [{:bandit, ">= 1.0.0", [hex: :bandit, repo: "hexpm", optional: true]}, {:cowboy, "~> 1.1 or ~> 2.4", [hex: :cowboy, repo: "hexpm", optional: true]}, {:ex_aws, "~> 2.1", [hex: :ex_aws, repo: "hexpm", optional: true]}, {:finch, "~> 0.6", [hex: :finch, repo: "hexpm", optional: true]}, {:gen_smtp, "~> 0.13 or ~> 1.0", [hex: :gen_smtp, repo: "hexpm", optional: true]}, {:hackney, "~> 1.9", [hex: :hackney, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mail, "~> 0.2", [hex: :mail, repo: "hexpm", optional: true]}, {:mime, "~> 1.1 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mua, "~> 0.2.0", [hex: :mua, repo: "hexpm", optional: true]}, {:multipart, "~> 0.4", [hex: :multipart, repo: "hexpm", optional: true]}, {:plug, "~> 1.9", [hex: :plug, repo: "hexpm", optional: true]}, {:plug_cowboy, ">= 1.0.0", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:req, "~> 0.5 or ~> 1.0", [hex: :req, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.2 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "878b1a7a6c10ebbf725a3349363f48f79c5e3d792eb621643b0d276a38acc0a6"}, "tailwind": {:hex, :tailwind, "0.2.2", "9e27288b568ede1d88517e8c61259bc214a12d7eed271e102db4c93fcca9b2cd", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}], "hexpm", "ccfb5025179ea307f7f899d1bb3905cd0ac9f687ed77feebc8f67bdca78565c4"}, "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"},