#!/usr/bin/env stack > -- stack script --resolver lts-22.6 --package text --package bytestring --package process --package tagsoup --package network-uri --package time atomize.lhs: turn a web page into an Atom feed over gopher ========================================================== Feed it the URL of a web page; it hands back an Atom 1.0 feed, derived by a fixed, deterministic ladder --- no LLM, no guessing that changes run to run. If it can't build one it says so, and on the raw endpoint a failure is itself a valid one-entry Atom feed (so `curl ... | xmllint` still parses, and a reader still shows *something*). curl gopher://gopher.someodd.zip/0/applets/atomize.lhs/example.com/blog > feed.xml (This file is markdown-flavoured literate Haskell. Headings use setext underlines rather than ATX-style `#` because GHC's literate parser reads a `#` at column 1 of a non-code line as a pragma --- setext sidesteps that.) The protocol point that makes this work --------------------------------------- A gopher item-type byte (the `0` in `/0/applets/...`) is a *client* rendering hint; it is never sent to the server. The server only ever sees the selector (plus a tab-query for type-7 searches). So we can: * keep the type-7 search box returning a **menu** (a "report" page), exactly as the protocol expects; and * serve the **raw Atom feed** at its own selector, reached as a type-0 (text document) link --- which an Atom feed legitimately is. The page URL rides in the selector as path-info. Same shape as `fewbytes/yt.lhs` (`/yt.lhs//audio` -> raw bytes). URL design ---------- `$SCRIPT` is wherever `routes.toml` mounts this file; the script computes its own mount point from `$selector - $pathinfo`. $SCRIPT landing menu (search prompt + docs) $SCRIPT + type-7 query a "report" menu: how the feed was derived + a type-0 link and a one-shot curl command for the raw feed $SCRIPT/ the raw Atom feed (one request) The path-info form accepts either a percent-encoded URL as one segment (what the report page emits) or a bare `host/path` that defaults to `https://`. The type-7 box is the most forgiving input: the query is sent raw after a tab, so you can paste any URL verbatim, scheme/query-string/and all, and it hands you back the encoded one-liner. The deterministic ladder ------------------------- 0. SSRF guard --- http/https only; refuse localhost, RFC-1918, link-local, and the cloud-metadata address. (This box also runs Ollama on localhost:11434, Icecast, etc.) curl is capped on time, redirects, and response size. 1. Sniff the fetched body. If it IS a feed: Atom is returned verbatim; RSS/RDF is converted to Atom. 2. Else autodiscover `` in the HTML, resolve it, fetch, return/convert. 3. Else probe a fixed list of common feed paths on the host. 4. Else synthesise a feed from the page's heading/article/list links (deduped, same-origin, capped) --- labelled as scraped. 5. Else fail with a clear reason. Determinism note: nothing here reads the wall clock. Atom is passed through verbatim; RSS dates are parsed from the source and reformatted; entries with no parseable date get a fixed sentinel (1970-01-01T00:00:00Z). Identical input -> identical bytes. Running the doctests -------------------- doctest-lhs atomize.lhs Module header and imports ------------------------- > {-# LANGUAGE OverloadedStrings #-} > module Main (main) where > > import Control.Exception (SomeException, try) > import Data.Char (isDigit) > import Data.List (find) > import Data.Maybe (fromMaybe, listToMaybe, mapMaybe) > import qualified Data.Text as T > import qualified Data.Text.IO as TIO > import Data.Time (UTCTime, defaultTimeLocale, formatTime, > parseTimeM) > import Network.URI (parseURI, parseURIReference, relativeTo, > uriAuthority, uriRegName, uriScheme, > uriToString, escapeURIString, > isUnreserved, unEscapeString) > import System.Environment (getArgs, lookupEnv) > import System.Exit (ExitCode (..)) > import System.IO (BufferMode (..), hFlush, hSetBuffering, > hSetEncoding, stdout, utf8) > import System.Process (readProcessWithExitCode) > import Text.HTML.TagSoup (Tag (..), parseTags) Defaults -------- Host/port overridable via `GOPHER_HOST` / `GOPHER_PORT` so the same script serves staging and production unedited. > defaultHost, defaultPort :: T.Text > defaultHost = "gopher.someodd.zip" > defaultPort = "70" > > -- | Atom requires a timestamp on the feed and every entry; when the > -- source has none we use this fixed value rather than the wall > -- clock, keeping output deterministic. > sentinelDate :: T.Text > sentinelDate = "1970-01-01T00:00:00Z" > > -- | Hard cap on synthesised entries, so a huge link list can't > -- produce a runaway feed. > maxEntries :: Int > maxEntries = 50 > > -- | Tried in order when no feed is declared. Resolved against the > -- target's scheme://host. First one that sniffs as a feed wins. > probePaths :: [T.Text] > probePaths = > [ "/feed", "/feed/", "/rss", "/rss.xml", "/atom.xml" > , "/feed.xml", "/index.xml", "/feeds/posts/default", "/?feed=rss2" ] Request parsing --------------- Four positional argv from Venusia (`$selector`, `$search`, `$pathinfo`, `$remote_ip`); we ignore the IP via the cons pattern. > data Req = Req > { reqSel :: T.Text > , reqQ :: T.Text > , reqP :: T.Text > } deriving Show > > parseArgs :: [String] -> Req > parseArgs (s:q:p:_) = Req (T.pack s) (T.pack q) (T.pack p) > parseArgs (s:q:_) = Req (T.pack s) (T.pack q) T.empty > parseArgs [s] = Req (T.pack s) T.empty T.empty > parseArgs [] = error > "atomize.lhs: missing argv[0] (gopher selector). Run via Venusia, \ > \or for manual testing pass selector + query/pathinfo explicitly." > > data Ctx = Ctx > { ctxScriptSel :: T.Text > , ctxHost :: T.Text > , ctxPort :: T.Text > } Main dispatch ------------- `stdout` is `NoBuffering` per house convention; the body is wrapped in `try` so any uncaught exception becomes a visible type-3 row (report path) rather than a blank response. The raw path's failures are handled inside `emitRaw` as an error-feed, so they never reach this catch. > main :: IO () > main = do > hSetBuffering stdout NoBuffering > hSetEncoding stdout utf8 > result <- try mainBody :: IO (Either SomeException ()) > case result of > Right () -> pure () > Left e -> mapM_ putLine > [ errorItem ("atomize.lhs crashed: " <> T.pack (show e)) > , terminator > ] > > mainBody :: IO () > mainBody = do > req <- parseArgs <$> getArgs > host <- maybe defaultHost T.pack <$> lookupEnv "GOPHER_HOST" > port <- maybe defaultPort T.pack <$> lookupEnv "GOPHER_PORT" > let pinfo = reqP req > scriptSel = T.dropEnd (T.length pinfo) (reqSel req) > ctx = Ctx scriptSel host port > q = T.strip (reqQ req) > -- The page URL rides in path-info as ONE opaque string; we > -- never split it on '/', so a URL's own slashes survive. > pTarget = T.dropWhile (== '/') pinfo > if not (T.null pTarget) > then emitRaw (normalizeTargetUrl (T.pack (unEscapeString (T.unpack pTarget)))) > else if T.null q > then emitLanding ctx > else emitReport ctx (normalizeTargetUrl q) URL normalisation ----------------- > -- | Trim, repair a single-slashed scheme, and default a scheme-less > -- input to https. Decoding from path-info happens before this. > -- > -- >>> normalizeTargetUrl " example.com/blog " > -- "https://example.com/blog" > -- > -- >>> normalizeTargetUrl "http://example.com" > -- "http://example.com" > -- > -- >>> normalizeTargetUrl "https:/example.com" > -- "https://example.com" > normalizeTargetUrl :: T.Text -> T.Text > normalizeTargetUrl raw = > let s = repairScheme (T.strip raw) > in if hasScheme s then s else "https://" <> s > > -- | >>> hasScheme "http://x" > -- True > -- > -- >>> hasScheme "x.com" > -- False > hasScheme :: T.Text -> Bool > hasScheme s = let l = T.toLower s > in "http://" `T.isPrefixOf` l || "https://" `T.isPrefixOf` l > > -- | Repair a scheme whose `//` got collapsed to `/` (happens when a > -- URL is typed unencoded into path-info and empty segments drop). > -- > -- >>> repairScheme "http:/example.com" > -- "http://example.com" > -- > -- >>> repairScheme "https://ok.com" > -- "https://ok.com" > repairScheme :: T.Text -> T.Text > repairScheme s > | "https://" `T.isPrefixOf` l = s > | "http://" `T.isPrefixOf` l = s > | "https:/" `T.isPrefixOf` l = "https://" <> T.drop 7 s > | "http:/" `T.isPrefixOf` l = "http://" <> T.drop 6 s > | otherwise = s > where l = T.toLower s SSRF guard ---------- > -- | The host component of a (normalised) URL, lower-cased. > -- > -- >>> hostOf "https://Example.com/x" > -- Just "example.com" > -- > -- >>> hostOf "not a url" > -- Nothing > hostOf :: T.Text -> Maybe T.Text > hostOf u = do > uri <- parseURI (T.unpack (normalizeTargetUrl u)) > auth <- uriAuthority uri > let h = T.toLower (T.pack (uriRegName auth)) > if T.null h then Nothing else Just h > > -- | True for hosts we refuse to fetch: localhost, RFC-1918, CGNAT, > -- link-local, the cloud-metadata IP, and `.local` mDNS names. > -- > -- >>> isBlockedHost "localhost" > -- True > -- > -- >>> isBlockedHost "192.168.1.10" > -- True > -- > -- >>> isBlockedHost "169.254.169.254" > -- True > -- > -- >>> isBlockedHost "example.com" > -- False > isBlockedHost :: T.Text -> Bool > isBlockedHost h0 = > let h = T.toLower h0 > in h `elem` ["", "localhost", "0.0.0.0", "::1", "[::1]", "metadata"] > || ".local" `T.isSuffixOf` h > || "127." `T.isPrefixOf` h > || "10." `T.isPrefixOf` h > || "192.168." `T.isPrefixOf` h > || "169.254." `T.isPrefixOf` h > || "100.64." `T.isPrefixOf` h > || in172 h > where > -- 172.16.0.0 .. 172.31.255.255 > in172 h = case T.splitOn "." h of > (a:b:_) | a == "172" > , all isDigit (T.unpack b) > , not (T.null b) > , let n = read (T.unpack b) :: Int > -> n >= 16 && n <= 31 > _ -> False Fetching -------- curl with a tight envelope: http/https only, capped redirects, time, and response size. A non-zero exit (including `--max-filesize` overflow) yields `Nothing`. > fetchUrl :: T.Text -> IO (Maybe T.Text) > fetchUrl url = case hostOf url of > Just h | not (isBlockedHost h) -> do > (ec, out, _) <- readProcessWithExitCode "curl" > [ "-sSL", "--proto", "=http,https", "--max-redirs", "3" > , "--connect-timeout", "5", "--max-time", "15" > , "--max-filesize", "5000000" > , "-A", "atomize.lhs (gopher feed maker)" > , "--", T.unpack url ] "" > pure $ case ec of > ExitSuccess | not (null out) -> Just (T.pack out) > _ -> Nothing > _ -> pure Nothing Document sniffing ----------------- Which kind of document is this? We find the first of the recognised root tokens to appear and key off that, so a stray `` mention deep in an HTML body doesn't masquerade as an Atom root. > data DocKind = KAtom | KRss | KRdf | KHtml deriving (Eq, Show) > > -- | >>> sniffDoc "" > -- KAtom > -- > -- >>> sniffDoc "" > -- KRss > -- > -- >>> sniffDoc "" > -- KHtml > -- > -- >>> sniffDoc "" > -- KRdf > sniffDoc :: T.Text -> DocKind > sniffDoc t = > let l = T.toLower t > posOf pat = let (pre, m) = T.breakOn pat l > in if T.null m then Nothing else Just (T.length pre) > cands = [ (KAtom, " , (KHtml, " hits = [ (k, p) | (k, pat) <- cands, Just p <- [posOf pat] ] > in case hits of > [] -> KHtml > _ -> fst (minimumByPos hits) > where minimumByPos = foldr1 (\a b -> if snd a <= snd b then a else b) Tag-soup helpers ---------------- Lenient lookups over a flat `[Tag Text]`, shared by discovery, conversion, and scraping. > lc :: T.Text -> T.Text > lc = T.toLower > > isOpen, isClose :: T.Text -> Tag T.Text -> Bool > isOpen n (TagOpen t _) = lc t == n > isOpen _ _ = False > isClose n (TagClose t) = lc t == n > isClose _ _ = False > > -- | Attribute value by (case-insensitive) name; "" if absent. > attrOf :: T.Text -> [(T.Text, T.Text)] -> T.Text > attrOf k attrs = fromMaybe "" (lookup (lc k) [ (lc a, b) | (a, b) <- attrs ]) > > -- | Concatenated text directly inside the first `...`. > innerText :: T.Text -> [Tag T.Text] -> T.Text > innerText name tags = > case dropWhile (not . isOpen name) tags of > (_:rest) -> T.strip . T.concat $ > [ s | TagText s <- takeWhile (not . isClose name) rest ] > [] -> "" > > -- | All `...` chunks in document order (non-nested). > chunksOf :: T.Text -> [Tag T.Text] -> [[Tag T.Text]] > chunksOf name = go > where > go ts = case dropWhile (not . isOpen name) ts of > [] -> [] > (_:rest) -> takeWhile (not . isClose name) rest > : go (dropWhile (not . isClose name) rest) Feed autodiscovery ------------------ > -- | Declared alternate feeds as (kind, raw-href), Atom-typed first. > discoverFeedLinks :: [Tag T.Text] -> [(DocKind, T.Text)] > discoverFeedLinks tags = > let links = [ (ty, attrOf "href" attrs) > | TagOpen n attrs <- tags, lc n == "link" > , let ty = lc (attrOf "type" attrs) > , ty == "application/atom+xml" || ty == "application/rss+xml" > , not (T.null (attrOf "href" attrs)) ] > atoms = [ (KAtom, h) | (ty, h) <- links, ty == "application/atom+xml" ] > rsses = [ (KRss, h) | (ty, h) <- links, ty == "application/rss+xml" ] > in atoms ++ rsses > > -- | Resolve a possibly-relative href against a base URL. > -- > -- >>> resolveUrl "https://a.com/blog/" "feed.xml" > -- "https://a.com/blog/feed.xml" > -- > -- >>> resolveUrl "https://a.com/blog/" "/atom" > -- "https://a.com/atom" > resolveUrl :: T.Text -> T.Text -> T.Text > resolveUrl base href = > case (parseURIReference (T.unpack href), parseURI (T.unpack base)) of > (Just r, Just b) -> T.pack (uriToString id (r `relativeTo` b) "") > _ -> href RSS / RDF -> Atom conversion ---------------------------- We take the first ``/`<link>` as the channel's, then map each `<item>` to an `<entry>`. Dates are parsed from the source and reformatted to RFC-3339; unparseable ones fall back to the sentinel. > rssToAtom :: T.Text -> T.Text -> T.Text > rssToAtom srcUrl body = > let tags = parseTags body > chTitle = firstNonEmpty [innerText "title" tags, "Untitled feed"] > chLink = firstNonEmpty [innerText "link" tags, srcUrl] > items = chunksOf "item" tags > entries = take maxEntries (map (itemToEntry srcUrl) items) > fUpdated = firstNonEmpty (map enUpdated entries ++ [sentinelDate]) > meta = Meta chTitle chLink chLink fUpdated > in buildAtom meta entries > > itemToEntry :: T.Text -> [Tag T.Text] -> Entry > itemToEntry srcUrl chunk = > let title = firstNonEmpty [innerText "title" chunk, "(untitled)"] > link = firstNonEmpty [innerText "link" chunk, innerText "guid" chunk] > gid = firstNonEmpty [innerText "guid" chunk, link, srcUrl] > date = toAtomDate (firstNonEmpty > [innerText "pubdate" chunk, innerText "dc:date" chunk]) > desc = innerText "description" chunk > in Entry title link gid date (if T.null desc then Nothing else Just desc) Synthesising from page links ----------------------------- When nothing is declared and nothing is at a common path, we build entries from anchors that sit inside heading / article / list-item context (which filters most nav/footer chrome), deduped by resolved href and kept same-origin. > scrapeEntries :: T.Text -> T.Text -> [Entry] > scrapeEntries base body = > let baseHost = fromMaybe "" (hostOf base) > raw = go 0 Nothing (parseTags body) > resolved = [ (resolveUrl base h, T.unwords (T.words txt)) > | (h, txt) <- raw, not (T.null h), not (T.null (T.strip txt)) ] > sameHost = [ (h, txt) | (h, txt) <- resolved > , hostOf h == Just baseHost ] > deduped = dedupeByFst sameHost > in take maxEntries > [ Entry txt h h sentinelDate Nothing | (h, txt) <- deduped ] > where > content = ["h1", "h2", "h3", "article", "li"] > -- depth = how many content containers are currently open; > -- cur = (href, text-so-far) while inside a captured anchor. > go :: Int -> Maybe (T.Text, T.Text) -> [Tag T.Text] > -> [(T.Text, T.Text)] > go _ _ [] = [] > go depth cur (tag:rest) = case tag of > TagOpen n attrs > | lc n == "a" && depth > 0 && cur == Nothing -> > go depth (Just (attrOf "href" attrs, "")) rest > | lc n `elem` content -> go (depth + 1) cur rest > | otherwise -> go depth cur rest > TagText s -> case cur of > Just (h, acc) -> go depth (Just (h, acc <> s)) rest > Nothing -> go depth cur rest > TagClose n > | lc n == "a" -> case cur of > Just (h, acc) -> (h, acc) : go depth Nothing rest > Nothing -> go depth cur rest > | lc n `elem` content -> go (max 0 (depth - 1)) cur rest > | otherwise -> go depth cur rest > _ -> go depth cur rest Atom emission ------------- We build the XML as text by hand --- we control every field, so a serialiser would be ceremony. Every interpolated value is escaped. > data Meta = Meta > { fmTitle :: T.Text, fmId :: T.Text, fmLink :: T.Text, fmUpdated :: T.Text } > > data Entry = Entry > { enTitle :: T.Text > , enLink :: T.Text > , enId :: T.Text > , enUpdated :: T.Text > , enSummary :: Maybe T.Text > } > > buildAtom :: Meta -> [Entry] -> T.Text > buildAtom meta entries = T.concat $ > [ "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" > , "<feed xmlns=\"http://www.w3.org/2005/Atom\">\n" > , el "title" (fmTitle meta) > , el "id" (nonEmptyId (fmId meta)) > , el "updated" (fmUpdated meta) > , linkEl (fmLink meta) > , " <generator>atomize.lhs</generator>\n" > ] ++ map entryXml entries ++ [ "</feed>\n" ] > where > el name v = " <" <> name <> ">" <> xmlEscape v <> "</" <> name <> ">\n" > linkEl h > | T.null h = "" > | otherwise = " <link rel=\"alternate\" href=\"" <> xmlEscape h <> "\"/>\n" > entryXml e = T.concat > [ " <entry>\n " > , el "title" (enTitle e) > , " ", el "id" (nonEmptyId (firstNonEmpty [enId e, enLink e])) > , " ", el "updated" (enUpdated e) > , " ", linkEl (enLink e) > , maybe "" (\s -> " " <> el "summary" s) (enSummary e) > , " </entry>\n" ] > -- Atom <id> must be an IRI; fall back to a tag: URI when blank. > nonEmptyId i = if T.null i then "tag:atomize.lhs,1970:empty" else i > > -- | Escape the five XML-significant characters. > -- > -- >>> xmlEscape "a<b>&\"'" > -- "a<b>&"'" > xmlEscape :: T.Text -> T.Text > xmlEscape = T.concatMap esc > where > esc '<' = "<" > esc '>' = ">" > esc '&' = "&" > esc '"' = """ > esc '\'' = "'" > esc c = T.singleton c Date handling ------------- > -- | Reformat a source date to RFC-3339, or the sentinel if no known > -- format parses. Pure (parses the given text; never reads the > -- clock), so output stays deterministic. > -- > -- >>> toAtomDate "Mon, 06 Sep 2021 16:45:00 +0000" > -- "2021-09-06T16:45:00Z" > -- > -- >>> toAtomDate "2021-09-06T16:45:00Z" > -- "2021-09-06T16:45:00Z" > -- > -- >>> toAtomDate "not a date" > -- "1970-01-01T00:00:00Z" > toAtomDate :: T.Text -> T.Text > toAtomDate s > | T.null (T.strip s) = sentinelDate > | otherwise = case parsed of > Just u -> T.pack (formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%SZ" u) > Nothing -> sentinelDate > where > str = T.unpack (T.strip s) > parsed = listToMaybe $ mapMaybe tryFmt fmts > tryFmt f = parseTimeM True defaultTimeLocale f str :: Maybe UTCTime > fmts = > [ "%a, %d %b %Y %H:%M:%S %z" > , "%a, %d %b %Y %H:%M:%S %Z" > , "%Y-%m-%dT%H:%M:%S%Z" > , "%Y-%m-%dT%H:%M:%S%z" > , "%Y-%m-%dT%H:%M:%SZ" > , "%Y-%m-%d" ] The deriving step (IO) ---------------------- Walks the ladder and reports both the resulting feed (if any) and a human note describing which rung produced it. > data Outcome = Outcome > { ocNote :: T.Text -- ^ how it went, for the report page > , ocFeed :: Maybe T.Text -- ^ the Atom feed on success > } > > deriveFeed :: T.Text -> IO Outcome > deriveFeed url = case hostOf url of > Nothing -> pure (Outcome "could not parse that as an http(s) URL" Nothing) > Just h | isBlockedHost h -> > pure (Outcome ("refused: \"" <> h <> "\" is a local/private address") Nothing) > Just _ -> do > mBody <- fetchUrl url > case mBody of > Nothing -> pure (Outcome "could not fetch the page (timeout, too big, or error)" Nothing) > Just body -> case sniffDoc body of > KAtom -> pure (Outcome "the URL is already an Atom feed (returned as-is)" > (Just body)) > KRss -> pure (Outcome "the URL is an RSS feed (converted to Atom)" > (Just (rssToAtom url body))) > KRdf -> pure (Outcome "the URL is an RDF/RSS feed (converted to Atom)" > (Just (rssToAtom url body))) > KHtml -> fromHtml url body > > -- | Tiers 2-4 for an HTML page. > fromHtml :: T.Text -> T.Text -> IO Outcome > fromHtml url body = do > let tags = parseTags body > declared <- tryDeclared url (discoverFeedLinks tags) > case declared of > Just o -> pure o > Nothing -> do > probed <- tryProbe url > case probed of > Just o -> pure o > Nothing -> case scrapeEntries url body of > [] -> pure (Outcome > "no declared feed, nothing at common feed paths, and no \ > \heading/list link structure to synthesise from" Nothing) > entries -> > let title = firstNonEmpty [innerText "title" tags, url] > meta = Meta title url url > (firstNonEmpty (map enUpdated entries ++ [sentinelDate])) > note = "no feed declared; synthesised " > <> T.pack (show (length entries)) > <> " entries from page links (may be noisy)" > in pure (Outcome note (Just (buildAtom meta entries))) > > -- | Fetch the first declared feed that resolves and parses. > tryDeclared :: T.Text -> [(DocKind, T.Text)] -> IO (Maybe Outcome) > tryDeclared _ [] = pure Nothing > tryDeclared base ((kind, href):more) = do > let abs' = resolveUrl base href > mBody <- fetchUrl abs' > case mBody of > Nothing -> tryDeclared base more > Just body -> case sniffDoc body of > KAtom -> pure (Just (Outcome ("found a declared Atom feed: " <> abs') (Just body))) > KRss -> pure (Just (Outcome ("found a declared RSS feed (converted): " <> abs') > (Just (rssToAtom abs' body)))) > KRdf -> pure (Just (Outcome ("found a declared RDF feed (converted): " <> abs') > (Just (rssToAtom abs' body)))) > KHtml | kind == KAtom -> pure (Just (Outcome > ("declared Atom feed returned non-feed content: " <> abs') Nothing)) > | otherwise -> tryDeclared base more > > -- | Probe the fixed common-path list on the target's host. > tryProbe :: T.Text -> IO (Maybe Outcome) > tryProbe url = case rootOf url of > Nothing -> pure Nothing > Just root -> go probePaths > where > go [] = pure Nothing > go (p:ps) = do > let cand = root <> p > mBody <- fetchUrl cand > case mBody >>= \b -> Just (sniffDoc b, b) of > Just (KAtom, b) -> pure (Just (Outcome ("found a feed by probing " <> p) (Just b))) > Just (KRss, b) -> pure (Just (Outcome ("found an RSS feed by probing " <> p <> " (converted)") > (Just (rssToAtom cand b)))) > Just (KRdf, b) -> pure (Just (Outcome ("found an RDF feed by probing " <> p <> " (converted)") > (Just (rssToAtom cand b)))) > _ -> go ps > > -- | scheme://host of a URL (no path), for probing. > -- > -- >>> rootOf "https://a.com/blog/post?x=1" > -- Just "https://a.com" > rootOf :: T.Text -> Maybe T.Text > rootOf url = do > uri <- parseURI (T.unpack (normalizeTargetUrl url)) > auth <- uriAuthority uri > -- uriScheme keeps its trailing colon, e.g. "https:" > pure (T.pack (uriScheme uri) <> "//" <> T.pack (uriRegName auth)) Raw endpoint ------------ The hero path: emit the Atom feed as raw bytes, no gopher terminator. On failure, still emit a valid one-entry Atom feed whose entry carries the reason --- so a reader/`xmllint` always parses a real feed. > emitRaw :: T.Text -> IO () > emitRaw url = do > Outcome note mFeed <- deriveFeed url > case mFeed of > Just feed -> TIO.putStr feed > Nothing -> TIO.putStr (errorFeed url note) > > errorFeed :: T.Text -> T.Text -> T.Text > errorFeed url note = buildAtom > (Meta ("atomize: could not build a feed for " <> url) > ("tag:atomize.lhs,1970:error") url sentinelDate) > [ Entry ("Could not build an Atom feed for " <> url) url > ("tag:atomize.lhs,1970:error") sentinelDate (Just note) ] Report page (type-7 result -> a menu) ------------------------------------- > emitReport :: Ctx -> T.Text -> IO () > emitReport ctx url = do > Outcome note mFeed <- deriveFeed url > putLine (infoLine "atomize -- turn a web page into an Atom feed.") > putLine (infoLine ("Target: " <> url)) > putLine (infoLine "") > case mFeed of > Just _ -> do > let enc = percentEncodeUrl url > rawSel = ctxScriptSel ctx <> "/" <> enc > portPart = if ctxPort ctx == "70" then "" else ":" <> ctxPort ctx > rawUri = "gopher://" <> ctxHost ctx <> portPart > <> "/0" <> rawSel > putLine (infoLine note) > putLine (infoLine "") > putLine (menuRow '0' "Atom feed -- open or save" rawSel > (ctxHost ctx) (ctxPort ctx)) > putLine (infoLine "One-shot from a shell:") > putLine (infoLine (" curl " <> rawUri <> " > feed.xml")) > Nothing -> do > putLine (errorItem ("Failed: " <> note)) > putLine (infoLine "") > putLine (infoLine "Try a more specific page (a blog index, a") > putLine (infoLine "category page) or a known feed URL directly.") > mapM_ putLine (footerRows ctx) > putLine terminator Landing page ------------ > emitLanding :: Ctx -> IO () > emitLanding ctx = do > let portPart = if ctxPort ctx == "70" then "" else ":" <> ctxPort ctx > base = "gopher://" <> ctxHost ctx <> portPart <> "/0" <> ctxScriptSel ctx > mapM_ putLine > [ infoLine "atomize -- turn any web page into an Atom feed." > , infoLine "Deterministic: discover a declared feed, convert RSS," > , infoLine "probe common paths, or synthesise from page links." > , infoLine "" > , searchLine "Paste a web-page URL" (ctxScriptSel ctx) (ctxHost ctx) (ctxPort ctx) > , infoLine "" > , infoLine "Or go straight to the raw feed from a shell:" > , infoLine (" curl " <> base <> "/example.com/blog") > , infoLine "(the URL is the selector; bare host/path defaults to https)" > ] > mapM_ putLine (footerRows ctx) > putLine terminator > > footerRows :: Ctx -> [T.Text] > footerRows ctx = > [ infoLine "" > , menuRow '1' "-- served over gopher * no js * no ads * no tracking --" > "" (ctxHost ctx) (ctxPort ctx) ] Selector encoding ----------------- The page URL becomes one opaque path-info segment: escape everything but the URI-unreserved set, so its slashes and colon survive intact. > -- | >>> percentEncodeUrl "https://a.com/b?x=1" > -- "https%3A%2F%2Fa.com%2Fb%3Fx%3D1" > percentEncodeUrl :: T.Text -> T.Text > percentEncodeUrl = T.pack . escapeURIString isUnreserved . T.unpack Gophermap line builders + sanitisation -------------------------------------- > putLine :: T.Text -> IO () > putLine t = TIO.putStr (t <> "\r\n") >> hFlush stdout > > -- | >>> infoLine "hello" > -- "ihello\t\t\t0" > infoLine :: T.Text -> T.Text > infoLine msg = "i" <> sanitize msg <> "\t\t\t0" > > -- | >>> menuRow '1' "home" "/" "host" "70" > -- "1home\t/\thost\t70" > menuRow :: Char -> T.Text -> T.Text -> T.Text -> T.Text -> T.Text > menuRow t display selector host port = > T.singleton t <> sanitize display <> "\t" <> selector > <> "\t" <> host <> "\t" <> port > > -- | >>> searchLine "Search" "/q" "host" "70" > -- "7Search\t/q\thost\t70" > searchLine :: T.Text -> T.Text -> T.Text -> T.Text -> T.Text > searchLine display sel host port = > "7" <> sanitize display <> "\t" <> sel <> "\t" <> host <> "\t" <> port > > -- | >>> errorItem "nope" > -- "3nope\t\t\t0" > errorItem :: T.Text -> T.Text > errorItem msg = "3" <> sanitize msg <> "\t\t\t0" > > terminator :: T.Text > terminator = "." > > -- | >>> sanitize "a\tb\nc" > -- "a b c" > sanitize :: T.Text -> T.Text > sanitize = T.map (\c -> if c == '\r' || c == '\n' || c == '\t' then ' ' else c) Small pure utilities -------------------- > -- | First non-empty (after trimming) text, or "" if all blank. > -- > -- >>> firstNonEmpty ["", " ", "x", "y"] > -- "x" > firstNonEmpty :: [T.Text] -> T.Text > firstNonEmpty = fromMaybe "" . find (not . T.null) . map T.strip > > -- | Keep the first occurrence of each first-component, order-stable. > -- > -- >>> dedupeByFst [("a",1),("b",2),("a",3)] > -- [("a",1),("b",2)] > dedupeByFst :: Eq a => [(a, b)] -> [(a, b)] > dedupeByFst xs = go [] xs > where > go _ [] = [] > go seen ((k,v):rest) > | k `elem` seen = go seen rest > | otherwise = (k,v) : go (k:seen) rest