#!/usr/bin/env stack > -- stack script --resolver lts-22.6 --package text --package bytestring --package process --package tagsoup --package network-uri --package time txtify.lhs: pull the main text out of a web page, over gopher ============================================================= Feed it the URL of a web page; it hands back the page's most likely main textual content as plaintext --- no markup, no chrome, no JavaScript. The same shape as `atomize.lhs`: a type-7 search box and a docs landing menu, plus a raw type-0 endpoint that streams the extracted text so the whole client is `curl`. curl gopher://gopher.someodd.zip/0/applets/txtify.lhs/example.com/some-article > article.txt (Markdown-flavoured literate Haskell. Headings use setext underlines rather than ATX `#`, because GHC's literate parser reads a `#` at column 1 of a non-code line as a pragma.) What "reliable" can and can't mean here --------------------------------------- The extraction is a deterministic heuristic, not a browser. It works well on server-rendered article pages (blogs, docs, most news that ships HTML). It will *not* work where the article text only exists after JavaScript runs, or sits behind a paywall / consent wall --- the New York Times being the classic example: a plain `curl` receives a shell page or a teaser, so there is no body text in the bytes to extract. When that happens txtify says so plainly rather than emitting garbage. This is a property of fetching-without-a-browser, not something a cleverer parser fixes. How the heuristic works ----------------------- 0. SSRF guard --- http/https only; refuse localhost, RFC-1918, link-local, CGNAT and the cloud-metadata address. curl is capped on time, redirects and response size. 1. Fetch the page (browser-ish UA, gzip allowed). 2. Drop whole "chrome" regions: script/style/nav/header/footer/ aside/form/figure and friends, contents and all. 3. From what's left, collect block-level chunks (p, h1-h6, li, blockquote, pre) with their text and the share of that text that sat inside `` (link density). 4. Keep headings, and keep blocks that are long enough and not mostly links --- which discards menus, tag clouds and bylines. 5. Render the survivors as wrapped plaintext, titled from ``. If nothing substantial survives, say why. Determinism: nothing reads the wall clock or makes a random choice; identical input bytes give identical output. URL design (mirrors atomize.lhs) -------------------------------- $SCRIPT landing menu (search prompt + docs) $SCRIPT + type-7 query a report menu: how it went + a type-0 link and a curl one-liner $SCRIPT/<page-url> the raw extracted text (one request) The page URL rides in path-info as one opaque string (its own slashes survive); a bare `host/path` defaults to `https://`. Running the doctests -------------------- doctest-lhs txtify.lhs Module header and imports ------------------------- > {-# LANGUAGE OverloadedStrings #-} > module Main (main) where > > import Control.Exception (SomeException, try) > import Data.Char (isDigit) > import Data.Maybe (fromMaybe) > import qualified Data.Text as T > import qualified Data.Text.IO as TIO > import Network.URI (escapeURIString, isUnreserved, > parseURI, unEscapeString, uriAuthority, > uriRegName) > import System.Environment (getArgs, lookupEnv) > import System.Exit (ExitCode (..)) > import System.IO (BufferMode (..), hFlush, hSetBuffering, > hSetEncoding, stdout, utf8) > import System.Process (readProcessWithExitCode) > import Text.HTML.TagSoup (Tag (..), parseTags) > import Text.HTML.TagSoup.Entity (lookupEntity) Defaults -------- Host/port overridable via `GOPHER_HOST` / `GOPHER_PORT` so the same script serves staging and production unedited. > defaultHost, defaultPort :: T.Text > defaultHost = "gopher.someodd.zip" > defaultPort = "70" > > -- | Wrap column for rendered paragraphs. > wrapCol :: Int > wrapCol = 76 > > -- | A block must have at least this many characters of non-link text > -- to be kept (headings are exempt). Filters bylines, captions, > -- single-word nav remnants. > minBlockChars :: Int > minBlockChars = 40 > > -- | If the whole extraction comes to fewer than this many characters > -- we flag it as "probably JS-rendered or paywalled" --- there was > -- markup, but almost no prose in it. > thinPageChars :: Int > thinPageChars = 250 Request parsing --------------- Four positional argv from Venusia (`$selector`, `$search`, `$pathinfo`, `$remote_ip`); we ignore the IP via the cons pattern. > data Req = Req > { reqSel :: T.Text > , reqQ :: T.Text > , reqP :: T.Text > } deriving Show > > parseArgs :: [String] -> Req > parseArgs (s:q:p:_) = Req (T.pack s) (T.pack q) (T.pack p) > parseArgs (s:q:_) = Req (T.pack s) (T.pack q) T.empty > parseArgs [s] = Req (T.pack s) T.empty T.empty > parseArgs [] = error > "txtify.lhs: missing argv[0] (gopher selector). Run via Venusia, \ > \or for manual testing pass selector + query/pathinfo explicitly." > > data Ctx = Ctx > { ctxScriptSel :: T.Text > , ctxHost :: T.Text > , ctxPort :: T.Text > } Main dispatch ------------- `stdout` is `NoBuffering` per house convention; the body is wrapped in `try` so an uncaught exception becomes a visible type-3 row on the menu paths. The raw path handles its own failures as a plain message, so they never reach this catch. > main :: IO () > main = do > hSetBuffering stdout NoBuffering > hSetEncoding stdout utf8 > result <- try mainBody :: IO (Either SomeException ()) > case result of > Right () -> pure () > Left e -> mapM_ putLine > [ errorItem ("txtify.lhs crashed: " <> T.pack (show e)) > , terminator > ] > > mainBody :: IO () > mainBody = do > req <- parseArgs <$> getArgs > host <- maybe defaultHost T.pack <$> lookupEnv "GOPHER_HOST" > port <- maybe defaultPort T.pack <$> lookupEnv "GOPHER_PORT" > let pinfo = reqP req > scriptSel = T.dropEnd (T.length pinfo) (reqSel req) > ctx = Ctx scriptSel host port > q = T.strip (reqQ req) > -- The page URL rides in path-info as ONE opaque string; never > -- split on '/', so a URL's own slashes survive. > pTarget = T.dropWhile (== '/') pinfo > if not (T.null pTarget) > then emitRaw (normalizeTargetUrl (T.pack (unEscapeString (T.unpack pTarget)))) > else if T.null q > then emitLanding ctx > else emitReport ctx (normalizeTargetUrl q) URL normalisation ----------------- > -- | Trim, repair a single-slashed scheme, and default a scheme-less > -- input to https. Decoding from path-info happens before this. > -- > -- >>> normalizeTargetUrl " example.com/blog " > -- "https://example.com/blog" > -- > -- >>> normalizeTargetUrl "http://example.com" > -- "http://example.com" > -- > -- >>> normalizeTargetUrl "https:/example.com" > -- "https://example.com" > normalizeTargetUrl :: T.Text -> T.Text > normalizeTargetUrl raw = > let s = repairScheme (T.strip raw) > in if hasScheme s then s else "https://" <> s > > -- | >>> hasScheme "http://x" > -- True > -- > -- >>> hasScheme "x.com" > -- False > hasScheme :: T.Text -> Bool > hasScheme s = let l = T.toLower s > in "http://" `T.isPrefixOf` l || "https://" `T.isPrefixOf` l > > -- | Repair a scheme whose `//` got collapsed to `/` (happens when a > -- URL is typed unencoded into path-info and empty segments drop). > -- > -- >>> repairScheme "http:/example.com" > -- "http://example.com" > -- > -- >>> repairScheme "https://ok.com" > -- "https://ok.com" > repairScheme :: T.Text -> T.Text > repairScheme s > | "https://" `T.isPrefixOf` l = s > | "http://" `T.isPrefixOf` l = s > | "https:/" `T.isPrefixOf` l = "https://" <> T.drop 7 s > | "http:/" `T.isPrefixOf` l = "http://" <> T.drop 6 s > | otherwise = s > where l = T.toLower s SSRF guard ---------- > -- | The host component of a (normalised) URL, lower-cased. > -- > -- >>> hostOf "https://Example.com/x" > -- Just "example.com" > -- > -- >>> hostOf "not a url" > -- Nothing > hostOf :: T.Text -> Maybe T.Text > hostOf u = do > uri <- parseURI (T.unpack (normalizeTargetUrl u)) > auth <- uriAuthority uri > let h = T.toLower (T.pack (uriRegName auth)) > if T.null h then Nothing else Just h > > -- | True for hosts we refuse to fetch: localhost, RFC-1918, CGNAT, > -- link-local, the cloud-metadata IP, and `.local` mDNS names. > -- > -- >>> isBlockedHost "localhost" > -- True > -- > -- >>> isBlockedHost "192.168.1.10" > -- True > -- > -- >>> isBlockedHost "169.254.169.254" > -- True > -- > -- >>> isBlockedHost "example.com" > -- False > isBlockedHost :: T.Text -> Bool > isBlockedHost h0 = > let h = T.toLower h0 > in h `elem` ["", "localhost", "0.0.0.0", "::1", "[::1]", "metadata"] > || ".local" `T.isSuffixOf` h > || "127." `T.isPrefixOf` h > || "10." `T.isPrefixOf` h > || "192.168." `T.isPrefixOf` h > || "169.254." `T.isPrefixOf` h > || "100.64." `T.isPrefixOf` h > || in172 h > where > -- 172.16.0.0 .. 172.31.255.255 > in172 h = case T.splitOn "." h of > (a:b:_) | a == "172" > , all isDigit (T.unpack b) > , not (T.null b) > , let n = read (T.unpack b) :: Int > -> n >= 16 && n <= 31 > _ -> False Fetching -------- curl with a tight envelope: http/https only, capped redirects, time and response size, gzip allowed, and a browser-ish User-Agent (some sites 403 an obvious bot). A non-zero exit (including `--max-filesize` overflow) yields `Nothing`. > fetchUrl :: T.Text -> IO (Maybe T.Text) > fetchUrl url = case hostOf url of > Just h | not (isBlockedHost h) -> do > (ec, out, _) <- readProcessWithExitCode "curl" > [ "-sSL", "--compressed", "--proto", "=http,https", "--max-redirs", "5" > , "--connect-timeout", "5", "--max-time", "20" > , "--max-filesize", "5000000" > , "-A", "Mozilla/5.0 (compatible; txtify.lhs; gopher text extractor)" > , "--", T.unpack url ] "" > pure $ case ec of > ExitSuccess | not (null out) -> Just (T.pack out) > _ -> Nothing > _ -> pure Nothing Tag-soup helpers ---------------- > lc :: T.Text -> T.Text > lc = T.toLower > > isOpen, isClose :: T.Text -> Tag T.Text -> Bool > isOpen n (TagOpen t _) = lc t == n > isOpen _ _ = False > isClose n (TagClose t) = lc t == n > isClose _ _ = False > > -- | Concatenated text directly inside the first `<name>...</name>`. > innerText :: T.Text -> [Tag T.Text] -> T.Text > innerText name tags = > case dropWhile (not . isOpen name) tags of > (_:rest) -> T.strip . T.concat $ > [ s | TagText s <- takeWhile (not . isClose name) rest ] > [] -> "" Region and block vocabulary --------------------------- > -- | Elements whose entire subtree is chrome we never want as text. > junkTags :: [T.Text] > junkTags = > [ "script", "style", "noscript", "template", "svg", "head" > , "nav", "header", "footer", "aside", "form", "button" > , "select", "option", "iframe", "figure", "figcaption" ] > > -- | Block-level elements we treat as text chunks. > blockTags :: [T.Text] > blockTags = > [ "p", "h1", "h2", "h3", "h4", "h5", "h6" > , "li", "blockquote", "pre", "dd" ] > > -- | >>> isHeadingTag "h2" > -- True > -- > -- >>> isHeadingTag "p" > -- False > isHeadingTag :: T.Text -> Bool > isHeadingTag t = t `elem` ["h1", "h2", "h3", "h4", "h5", "h6"] Block extraction ---------------- A single left-to-right pass over the flat tag list. We carry a stack of currently-open junk tags (skip everything while non-empty), the block currently being captured (if any), and the anchor nesting depth inside that block (for link-density). Nested block opens are folded into the enclosing block --- an approximation that is fine for a heuristic and keeps the walk linear. > data Block = Block > { blkTag :: T.Text -- ^ normalised element name > , blkText :: T.Text -- ^ all text captured in the block > , blkLink :: T.Text -- ^ the subset of that text inside <a> > } > > extractBlocks :: [Tag T.Text] -> [Block] > extractBlocks = go [] Nothing 0 > where > go :: [T.Text] -> Maybe Block -> Int -> [Tag T.Text] -> [Block] > go _ cur _ [] = flush cur [] > go junk cur aDepth (tg:ts) = case tg of > TagOpen rawN _ > | nm `elem` junkTags -> go (nm : junk) cur aDepth ts > | not (null junk) -> go junk cur aDepth ts > | nm == "a" -> go junk cur (anchorBump cur aDepth) ts > | nm == "br", Just b <- cur -> go junk (Just b { blkText = blkText b <> " " }) aDepth ts > | nm `elem` blockTags -> case cur of > Nothing -> go junk (Just (Block nm "" "")) 0 ts -- start fresh > Just _ -> go junk cur aDepth ts -- fold into enclosing > | otherwise -> go junk cur aDepth ts > TagClose _ > | nm `elem` junkTags -> go (dropFirst nm junk) cur aDepth ts > | not (null junk) -> go junk cur aDepth ts > | nm == "a" && aDepth > 0 -> go junk cur (aDepth - 1) ts > | Just b <- cur, nm == blkTag b -> flush cur (go junk Nothing 0 ts) > | otherwise -> go junk cur aDepth ts > TagText s > | not (null junk) -> go junk cur aDepth ts > | otherwise -> case cur of > Just b -> let b' = b { blkText = blkText b <> s > , blkLink = if aDepth > 0 > then blkLink b <> s > else blkLink b } > in go junk (Just b') aDepth ts > Nothing -> go junk cur aDepth ts > _ -> go junk cur aDepth ts > where nm = case tg of > TagOpen x _ -> lc x > TagClose x -> lc x > _ -> "" > > -- Count an anchor only when we're actually inside a block. > anchorBump Nothing d = d > anchorBump (Just _) d = d + 1 > > flush Nothing rest = rest > flush (Just b) rest = b : rest > > -- | Drop the first occurrence of an element (the open junk tag we're > -- now closing) from the open-junk stack. > -- > -- >>> dropFirst "nav" ["nav","div"] > -- ["div"] > dropFirst :: Eq a => a -> [a] -> [a] > dropFirst _ [] = [] > dropFirst x (y:ys) > | x == y = ys > | otherwise = y : dropFirst x ys Scoring and keep-decision ------------------------- > -- | Fraction of a block's text that sat inside links. 1.0 for an > -- empty block (so it's never kept by the length rule). > -- > -- >>> linkDensity (Block "p" "hello world" "") > -- 0.0 > -- > -- >>> linkDensity (Block "li" "abcd" "abcd") > -- 1.0 > linkDensity :: Block -> Double > linkDensity b = > let tot = fromIntegral (T.length (collapseWs (blkText b))) > lnk = fromIntegral (T.length (collapseWs (blkLink b))) > in if tot <= 0 then 1 else lnk / tot > > -- | Keep headings with any text; keep other blocks only when they > -- carry enough non-link prose to look like content. > keepBlock :: Block -> Bool > keepBlock b = > let txt = collapseWs (decodeEntities (blkText b)) > in not (T.null txt) && > ( isHeadingTag (blkTag b) > || (T.length txt >= minBlockChars && linkDensity b < 0.5) ) Rendering to plaintext ---------------------- > renderBlock :: Block -> T.Text > renderBlock b > | blkTag b == "pre" = > T.intercalate "\n" (map (" " <>) (T.lines (T.strip (decodeEntities (blkText b))))) > | isHeadingTag (blkTag b) = txt > | blkTag b == "li" = wrapText wrapCol ("- " <> txt) > | otherwise = wrapText wrapCol txt > where txt = collapseWs (decodeEntities (blkText b)) > > -- | Build the whole document: title, rule, source line, then the > -- kept blocks separated by blank lines. Empty when nothing survives. > renderPlain :: T.Text -> T.Text -> [Block] -> T.Text > renderPlain title url blocks = > let kept = filter keepBlock blocks > t = collapseWs (decodeEntities title) > title' = if T.null t then url else t > rule = T.replicate (max 3 (min wrapCol (T.length title'))) "=" > hdr = T.intercalate "\n" [ title', rule, "", "Source: " <> url ] > body = T.intercalate "\n\n" (map renderBlock kept) > in if null kept then "" else hdr <> "\n\n" <> body <> "\n" > > -- | The pure pipeline: HTML in, plaintext out ("" on no content). > -- > -- >>> plainifyHtml "https://x/a" "<html><head><title>Hi

Heads up

This is the actual body paragraph and it is plainly long enough to keep.

" > -- "Hi\n===\n\nSource: https://x/a\n\nHeads up\n\nThis is the actual body paragraph and it is plainly long enough to keep.\n" > plainifyHtml :: T.Text -> T.Text -> T.Text > plainifyHtml url html = > let tags = parseTags html > in renderPlain (innerText "title" tags) url (extractBlocks tags) The deriving step (IO) ---------------------- > data Outcome = Outcome > { ocNote :: T.Text -- ^ how it went, for the report page > , ocText :: Maybe T.Text -- ^ the extracted plaintext on success > } > > deriveText :: T.Text -> IO Outcome > deriveText url = case hostOf url of > Nothing -> pure (Outcome "could not parse that as an http(s) URL" Nothing) > Just h | isBlockedHost h -> > pure (Outcome ("refused: \"" <> h <> "\" is a local/private address") Nothing) > Just _ -> do > mBody <- fetchUrl url > case mBody of > Nothing -> pure (Outcome "could not fetch the page (timeout, too big, or error)" Nothing) > Just body -> > let txt = plainifyHtml url body > in if T.null txt > then pure (Outcome > "found no main text block -- the page is likely \ > \JS-rendered or paywalled (plain curl runs no JavaScript)" Nothing) > else > let wc = length (T.words txt) > in if T.length txt < thinPageChars > then pure (Outcome > ("extracted very little (~" <> T.pack (show wc) <> " words) -- \ > \likely a JS-rendered or paywalled page (e.g. NYT)") (Just txt)) > else pure (Outcome > ("extracted ~" <> T.pack (show wc) <> " words") (Just txt)) Raw endpoint ------------ The hero path: stream the extracted text as raw bytes, no gopher terminator. On failure we still emit human-readable plaintext (so a shell `curl` shows the reason rather than an empty file). > emitRaw :: T.Text -> IO () > emitRaw url = do > Outcome note mTxt <- deriveText url > case mTxt of > Just txt -> TIO.putStr txt > Nothing -> TIO.putStr $ T.intercalate "\n" > [ "txtify: could not extract text from " <> url > , "" > , note > , "" ] Report page (type-7 result -> a menu) ------------------------------------- > emitReport :: Ctx -> T.Text -> IO () > emitReport ctx url = do > Outcome note mTxt <- deriveText url > putLine (infoLine "txtify -- extract the main text of a web page.") > putLine (infoLine ("Target: " <> url)) > putLine (infoLine "") > case mTxt of > Just _ -> do > let enc = percentEncodeUrl url > rawSel = ctxScriptSel ctx <> "/" <> enc > portPart = if ctxPort ctx == "70" then "" else ":" <> ctxPort ctx > rawUri = "gopher://" <> ctxHost ctx <> portPart <> "/0" <> rawSel > putLine (infoLine note) > putLine (infoLine "") > putLine (menuRow '0' "Read the extracted text" rawSel > (ctxHost ctx) (ctxPort ctx)) > putLine (infoLine "One-shot from a shell:") > putLine (infoLine (" curl " <> rawUri <> " > article.txt")) > Nothing -> do > putLine (errorItem ("Failed: " <> note)) > putLine (infoLine "") > putLine (infoLine "Try the article's own page (not an index), or a") > putLine (infoLine "site that ships its text as HTML rather than JS.") > putLine terminator Landing page ------------ > emitLanding :: Ctx -> IO () > emitLanding ctx = do > let portPart = if ctxPort ctx == "70" then "" else ":" <> ctxPort ctx > base = "gopher://" <> ctxHost ctx <> portPart <> "/0" <> ctxScriptSel ctx > mapM_ putLine > [ infoLine "txtify -- pull the main text out of any web page." > , infoLine "Deterministic heuristic: strip chrome, keep the prose," > , infoLine "hand it back as plaintext. No JS, so JS-only/paywalled" > , infoLine "pages (e.g. NYT) will come back thin -- it'll tell you." > , infoLine "" > , searchLine "Paste a web-page URL" (ctxScriptSel ctx) (ctxHost ctx) (ctxPort ctx) > , infoLine "" > , infoLine "Or go straight to the raw text from a shell:" > , infoLine (" curl " <> base <> "/example.com/some-article") > , infoLine "(the URL is the selector; bare host/path defaults to https)" > ] > putLine terminator Selector encoding ----------------- The page URL becomes one opaque path-info segment: escape everything but the URI-unreserved set, so its slashes and colon survive intact. > -- | >>> percentEncodeUrl "https://a.com/b?x=1" > -- "https%3A%2F%2Fa.com%2Fb%3Fx%3D1" > percentEncodeUrl :: T.Text -> T.Text > percentEncodeUrl = T.pack . escapeURIString isUnreserved . T.unpack Text utilities -------------- > -- | Collapse all runs of whitespace (incl. NBSP) to single spaces > -- and trim. Pure. > -- > -- >>> collapseWs " a\t b\n c " > -- "a b c" > collapseWs :: T.Text -> T.Text > collapseWs = T.unwords . T.words . T.map deNbsp > where deNbsp c = if c == '\160' then ' ' else c > > -- | Greedy word-wrap to a column width. > -- > -- >>> wrapText 10 "the quick brown fox" > -- "the quick\nbrown fox" > wrapText :: Int -> T.Text -> T.Text > wrapText w = T.intercalate "\n" . wrapWords . T.words > where > wrapWords [] = [] > wrapWords (x:xs) = go x xs > go line [] = [line] > go line (y:ys) > | T.length line + 1 + T.length y <= w = go (line <> " " <> y) ys > | otherwise = line : go y ys > > -- | Decode HTML entities (named, decimal, hex). A safety net: if the > -- parser already decoded them there are no `&name;` left to touch, > -- so this is idempotent on normal text. > -- > -- >>> decodeEntities "Tom & Jerry 'hi' & co" > -- "Tom & Jerry 'hi' & co" > decodeEntities :: T.Text -> T.Text > decodeEntities = T.pack . decode . T.unpack > where > decode [] = [] > decode ('&':cs) = case break (== ';') cs of > (ent, ';':rest) > | not (null ent), Just dec <- lookupEntity ent -> dec ++ decode rest > _ -> '&' : decode cs > decode (c:cs) = c : decode cs Gophermap line builders + sanitisation -------------------------------------- > putLine :: T.Text -> IO () > putLine t = TIO.putStr (t <> "\r\n") >> hFlush stdout > > -- | >>> infoLine "hello" > -- "ihello\t\t\t0" > infoLine :: T.Text -> T.Text > infoLine msg = "i" <> sanitize msg <> "\t\t\t0" > > -- | >>> menuRow '1' "home" "/" "host" "70" > -- "1home\t/\thost\t70" > menuRow :: Char -> T.Text -> T.Text -> T.Text -> T.Text -> T.Text > menuRow t display selector host port = > T.singleton t <> sanitize display <> "\t" <> selector > <> "\t" <> host <> "\t" <> port > > -- | >>> searchLine "Search" "/q" "host" "70" > -- "7Search\t/q\thost\t70" > searchLine :: T.Text -> T.Text -> T.Text -> T.Text -> T.Text > searchLine display sel host port = > "7" <> sanitize display <> "\t" <> sel <> "\t" <> host <> "\t" <> port > > -- | >>> errorItem "nope" > -- "3nope\t\t\t0" > errorItem :: T.Text -> T.Text > errorItem msg = "3" <> sanitize msg <> "\t\t\t0" > > terminator :: T.Text > terminator = "." > > -- | >>> sanitize "a\tb\nc" > -- "a b c" > sanitize :: T.Text -> T.Text > sanitize = T.map (\c -> if c == '\r' || c == '\n' || c == '\t' then ' ' else c)