3 Commits

Author SHA1 Message Date
36d97eebec Merge branch 'feature/meeseeks-parser' into develop 2024-06-04 00:08:38 +02:00
4a25eae6c6 meeseeks in 2024-06-04 00:06:23 +02:00
757c1fe063 start rewrite 2024-06-03 19:33:10 +02:00
4 changed files with 69 additions and 36 deletions

View File

@ -1,8 +1,7 @@
defmodule SomethingErlang.AwfulApi do
require Logger
alias SomethingErlang.AwfulApi.Thread
alias SomethingErlang.AwfulApi.Bookmarks
alias SomethingErlang.AwfulApi.{Client, Thread, Bookmarks}
@doc """
Returns a list of all posts on a page of a thread.
@ -16,7 +15,8 @@ defmodule SomethingErlang.AwfulApi do
12
"""
def parsed_thread(id, page, user) do
Thread.compile(id, page, user)
Client.thread_doc(id, page, user)
|> Thread.compile()
end
def bookmarks(user) do

View File

@ -1,24 +1,43 @@
defmodule SomethingErlang.AwfulApi.Thread do
import Meeseeks.CSS
require Logger
alias SomethingErlang.AwfulApi.Client
def compile(html) do
title =
Meeseeks.one(html, css("title"))
|> Meeseeks.text()
|> String.replace(" - The Something Awful Forums", "")
def compile(id, page, user) do
doc = Client.thread_doc(id, page, user)
html = Floki.parse_document!(doc)
thread = Floki.find(html, "#thread") |> Floki.filter_out("table.post.ignored")
thread =
Meeseeks.one(html, css("#thread"))
title = Floki.find(html, "title") |> Floki.text()
title = title |> String.replace(" - The Something Awful Forums", "")
# Floki.find(html, "#thread") |> Floki.filter_out("table.post.ignored")
thread_id =
Meeseeks.attr(thread, "class")
|> String.split(":")
|> List.last()
|> String.to_integer()
page =
Meeseeks.one(html, css("#content .pages.top option[selected]"))
|> Meeseeks.text()
|> case do
"" -> 1
s -> String.to_integer(s)
end
page_count =
case Floki.find(html, "#content .pages.top option:last-of-type") |> Floki.text() do
Meeseeks.one(html, css("#content .pages.top option:last-of-type"))
|> Meeseeks.text()
|> case do
"" -> 1
s -> String.to_integer(s)
end
posts =
for post <- Floki.find(thread, "table.post") do
for post <- Meeseeks.all(thread, css("table.post")) do
%{
userinfo: post |> userinfo(),
postdate: post |> postdate(),
@ -26,14 +45,18 @@ defmodule SomethingErlang.AwfulApi.Thread do
}
end
%{id: id, title: title, page: page, page_count: page_count, posts: posts}
%{id: thread_id, title: title, page: page, page_count: page_count, posts: posts}
end
defp userinfo(post) do
user = Floki.find(post, "dl.userinfo")
name = user |> Floki.find("dt") |> Floki.text()
regdate = user |> Floki.find("dd.registered") |> Floki.text()
title = user |> Floki.find("dd.title") |> List.first() |> Floki.children() |> Floki.raw_html()
user = Meeseeks.one(post, css("dl.userinfo"))
name = user |> Meeseeks.one(css("dt")) |> Meeseeks.text()
regdate = user |> Meeseeks.one(css("dd.registered")) |> Meeseeks.text()
title =
user
|> Meeseeks.one(css("dd.title > *"))
|> Meeseeks.html()
%{
name: name,
@ -43,12 +66,16 @@ defmodule SomethingErlang.AwfulApi.Thread do
end
defp postdate(post) do
date = Floki.find(post, "td.postdate") |> Floki.find("td.postdate") |> Floki.text()
date =
post
|> Meeseeks.one(css("td.postdate"))
|> Meeseeks.text()
[month_text, day, year, hours, minutes] =
date
|> String.split(~r{[\s,:]}, trim: true)
|> Enum.drop(1)
|> Enum.drop(2)
|> dbg()
month =
1 +
@ -68,23 +95,20 @@ defmodule SomethingErlang.AwfulApi.Thread do
end
defp postbody(post) do
body =
Floki.find(post, "td.postbody")
|> List.first()
|> Floki.filter_out(:comment)
{_, _, body} =
post
|> Meeseeks.one(css("td.postbody"))
|> Meeseeks.tree()
Floki.traverse_and_update(body, fn
{"img", attrs, []} -> transform(:img, attrs)
{"a", attrs, children} -> transform(:a, attrs, children)
other -> other
end)
|> Floki.children()
|> Floki.raw_html()
body
|> Enum.map(&transform/1)
|> Enum.reject(fn x -> x == "" end)
|> then(&{"div", [], &1})
|> Meeseeks.parse(:tuple_tree)
|> Meeseeks.html()
end
defp transform(elem, attr, children \\ [])
defp transform(:img, attrs, _children) do
defp transform({"img", attrs, _children}) do
{"class", class} = List.keyfind(attrs, "class", 0, {"class", ""})
if class == "sa-smilie" do
@ -95,7 +119,7 @@ defmodule SomethingErlang.AwfulApi.Thread do
end
end
defp transform(:a, attrs, children) do
defp transform({"a", attrs, children}) do
{"href", href} = List.keyfind(attrs, "href", 0, {"href", ""})
cond do
@ -124,6 +148,12 @@ defmodule SomethingErlang.AwfulApi.Thread do
end
end
# Fallback clauses of transform/1, tried after the "img" and "a" clauses.

# Comments carry nothing worth rendering. They are mapped to "" so that the
# `Enum.reject/2` on "" in the caller (and in transform_children/1) drops them.
defp transform({:comment, _}), do: ""

# Any other element keeps its tag and attributes, but its children are
# rewritten too. Without this, only the direct children of the post body were
# processed and links, images and comments nested deeper (e.g. inside a quote
# block) were passed through untransformed.
defp transform({tag, attrs, children}), do: {tag, attrs, transform_children(children)}

# Text nodes reaching transform/1 directly are stripped of surrounding
# whitespace; whitespace-only nodes collapse to "" and are rejected by the
# caller.
defp transform(text) when is_binary(text),
  do: String.trim(text)

# Rewrites the element and comment nodes of a child list via transform/1.
# Nested text is passed through verbatim so spacing around inline elements is
# preserved. Anything that is not a list is returned unchanged.
defp transform_children(children) when is_list(children) do
  children
  |> Enum.map(fn
    text when is_binary(text) -> text
    node -> transform(node)
  end)
  |> Enum.reject(&(&1 == ""))
end

defp transform_children(other), do: other
defp transform_link(:mp4, href),
do:
{"div", [{"class", "responsive-embed"}],

View File

@ -39,8 +39,7 @@ defmodule SomethingErlang.MixProject do
{:phoenix_html, "~> 4.0"},
{:phoenix_live_reload, "~> 1.2", only: :dev},
{:phoenix_live_view, "~> 0.20.2"},
# {:floki, ">= 0.30.0", only: :test},
{:floki, ">= 0.30.0"},
{:floki, ">= 0.30.0", only: :test},
{:phoenix_live_dashboard, "~> 0.8.3"},
{:esbuild, "~> 0.8", runtime: Mix.env() == :dev},
{:tailwind, "~> 0.2", runtime: Mix.env() == :dev},
@ -59,6 +58,7 @@ defmodule SomethingErlang.MixProject do
{:jason, "~> 1.2"},
{:dns_cluster, "~> 0.1.1"},
{:bandit, "~> 1.2"},
{:meeseeks, "~> 0.17.0"},
{:req, "~> 0.5.0"}
]
end

View File

@ -15,6 +15,8 @@
"heroicons": {:git, "https://github.com/tailwindlabs/heroicons.git", "88ab3a0d790e6a47404cba02800a6b25d2afae50", [tag: "v2.1.1", sparse: "optimized"]},
"hpax": {:hex, :hpax, "0.1.2", "09a75600d9d8bbd064cdd741f21fc06fc1f4cf3d0fcc335e5aa19be1a7235c84", [:mix], [], "hexpm", "2c87843d5a23f5f16748ebe77969880e29809580efdaccd615cd3bed628a8c13"},
"jason": {:hex, :jason, "1.4.1", "af1504e35f629ddcdd6addb3513c3853991f694921b1b9368b0bd32beb9f1b63", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "fbb01ecdfd565b56261302f7e1fcc27c4fb8f32d56eab74db621fc154604a7a1"},
"meeseeks": {:hex, :meeseeks, "0.17.0", "8a41ceccd2365476c2b779292e7649fb25f0a9735030905941f1244d2095c8a6", [:mix], [{:meeseeks_html5ever, "~> 0.14.3", [hex: :meeseeks_html5ever, repo: "hexpm", optional: false]}], "hexpm", "13efaf321a1517dea046cb48ff9baa9dc0604d9afd82c57501bc01dc45a5e309"},
"meeseeks_html5ever": {:hex, :meeseeks_html5ever, "0.14.3", "7827c6ce393d9f99dd0220c356fd66ee7101718037ec6f7f18d4bcba84ef1798", [:mix], [{:rustler, ">= 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.6.1", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "6b69573b97120fcc6e97045178ad085fd3ee10a5b49c1e9ebb8a28bd4a9c538b"},
"mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"},
"mint": {:hex, :mint, "1.6.0", "88a4f91cd690508a04ff1c3e28952f322528934be541844d54e0ceb765f01d5e", [:mix], [{:castore, "~> 0.1.0 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:hpax, "~> 0.1.1 or ~> 0.2.0", [hex: :hpax, repo: "hexpm", optional: false]}], "hexpm", "3c5ae85d90a5aca0a49c0d8b67360bbe407f3b54f1030a111047ff988e8fefaa"},
"nimble_options": {:hex, :nimble_options, "1.1.1", "e3a492d54d85fc3fd7c5baf411d9d2852922f66e69476317787a7b2bb000a61b", [:mix], [], "hexpm", "821b2470ca9442c4b6984882fe9bb0389371b8ddec4d45a9504f00a66f650b44"},
@ -31,6 +33,7 @@
"plug_crypto": {:hex, :plug_crypto, "2.1.0", "f44309c2b06d249c27c8d3f65cfe08158ade08418cf540fd4f72d4d6863abb7b", [:mix], [], "hexpm", "131216a4b030b8f8ce0f26038bc4421ae60e4bb95c5cf5395e1421437824c4fa"},
"postgrex": {:hex, :postgrex, "0.18.0", "f34664101eaca11ff24481ed4c378492fed2ff416cd9b06c399e90f321867d7e", [:mix], [{:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:decimal, "~> 1.5 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "a042989ba1bc1cca7383ebb9e461398e3f89f868c92ce6671feb7ef132a252d1"},
"req": {:hex, :req, "0.5.0", "6d8a77c25cfc03e06a439fb12ffb51beade53e3fe0e2c5e362899a18b50298b3", [:mix], [{:brotli, "~> 0.3.1", [hex: :brotli, repo: "hexpm", optional: true]}, {:ezstd, "~> 1.0", [hex: :ezstd, repo: "hexpm", optional: true]}, {:finch, "~> 0.17", [hex: :finch, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mime, "~> 1.6 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:nimble_csv, "~> 1.0", [hex: :nimble_csv, repo: "hexpm", optional: true]}, {:plug, "~> 1.0", [hex: :plug, repo: "hexpm", optional: true]}], "hexpm", "dda04878c1396eebbfdec6db6f3d4ca609e5c8846b7ee88cc56eb9891406f7a3"},
"rustler_precompiled": {:hex, :rustler_precompiled, "0.6.3", "f838d94bc35e1844973ee7266127b156fdc962e9e8b7ff666c8fb4fed7964d23", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "e18ecca3669a7454b3a2be75ae6c3ef01d550bc9a8cf5fbddcfff843b881d7c6"},
"swoosh": {:hex, :swoosh, "1.16.9", "20c6a32ea49136a4c19f538e27739bb5070558c0fa76b8a95f4d5d5ca7d319a1", [:mix], [{:bandit, ">= 1.0.0", [hex: :bandit, repo: "hexpm", optional: true]}, {:cowboy, "~> 1.1 or ~> 2.4", [hex: :cowboy, repo: "hexpm", optional: true]}, {:ex_aws, "~> 2.1", [hex: :ex_aws, repo: "hexpm", optional: true]}, {:finch, "~> 0.6", [hex: :finch, repo: "hexpm", optional: true]}, {:gen_smtp, "~> 0.13 or ~> 1.0", [hex: :gen_smtp, repo: "hexpm", optional: true]}, {:hackney, "~> 1.9", [hex: :hackney, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:mail, "~> 0.2", [hex: :mail, repo: "hexpm", optional: true]}, {:mime, "~> 1.1 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mua, "~> 0.2.0", [hex: :mua, repo: "hexpm", optional: true]}, {:multipart, "~> 0.4", [hex: :multipart, repo: "hexpm", optional: true]}, {:plug, "~> 1.9", [hex: :plug, repo: "hexpm", optional: true]}, {:plug_cowboy, ">= 1.0.0", [hex: :plug_cowboy, repo: "hexpm", optional: true]}, {:req, "~> 0.5 or ~> 1.0", [hex: :req, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.2 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "878b1a7a6c10ebbf725a3349363f48f79c5e3d792eb621643b0d276a38acc0a6"},
"tailwind": {:hex, :tailwind, "0.2.2", "9e27288b568ede1d88517e8c61259bc214a12d7eed271e102db4c93fcca9b2cd", [:mix], [{:castore, ">= 0.0.0", [hex: :castore, repo: "hexpm", optional: false]}], "hexpm", "ccfb5025179ea307f7f899d1bb3905cd0ac9f687ed77feebc8f67bdca78565c4"},
"telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"},