#!/usr/bin/env stack
> -- stack script --resolver lts-22.6 --package text --package bytestring --package process --package tagsoup --package network-uri --package time

atomize.lhs: turn a web page into an Atom feed over gopher
==========================================================

Feed it the URL of a web page; it hands back an Atom 1.0 feed,
derived by a fixed, deterministic ladder --- no LLM, no guessing
that changes run to run. If it can't build one it says so, and on
the raw endpoint a failure is itself a valid one-entry Atom feed
(so `curl ... | xmllint` still parses, and a reader still shows
*something*).

    curl gopher://gopher.someodd.zip/0/applets/atomize.lhs/example.com/blog > feed.xml

(This file is markdown-flavoured literate Haskell. Headings use
setext underlines rather than ATX-style `#` because GHC's literate
parser reads a `#` at column 1 of a non-code line as a pragma ---
setext sidesteps that.)

The protocol point that makes this work
---------------------------------------

A gopher item-type byte (the `0` in `/0/applets/...`) is a *client*
rendering hint; it is never sent to the server. The server only ever
sees the selector (plus a tab-query for type-7 searches). So we can:

  * keep the type-7 search box returning a **menu** (a "report" page),
    exactly as the protocol expects; and
  * serve the **raw Atom feed** at its own selector, reached as a
    type-0 (text document) link --- which an Atom feed legitimately
    is. The page URL rides in the selector as path-info.

Same shape as `fewbytes/yt.lhs` (`/yt.lhs/<id>/audio` -> raw bytes).

URL design
----------

`$SCRIPT` is wherever `routes.toml` mounts this file; the script
computes its own mount point from `$selector - $pathinfo`.

    $SCRIPT                    landing menu (search prompt + docs)
    $SCRIPT  + type-7 query    a "report" menu: how the feed was
                               derived + a type-0 link and a one-shot
                               curl command for the raw feed
    $SCRIPT/<page-url>         the raw Atom feed (one request)

The path-info form accepts either a percent-encoded URL as one
segment (what the report page emits) or a bare `host/path` that
defaults to `https://`. The type-7 box is the most forgiving input:
the query is sent raw after a tab, so you can paste any URL verbatim,
scheme/query-string/and all, and it hands you back the encoded
one-liner.

The deterministic ladder
-------------------------

  0. SSRF guard --- http/https only; refuse localhost, RFC-1918,
     link-local, and the cloud-metadata address. (This box also runs
     Ollama on localhost:11434, Icecast, etc.) curl is capped on
     time, redirects, and response size.
  1. Sniff the fetched body. If it IS a feed: Atom is returned
     verbatim; RSS/RDF is converted to Atom.
  2. Else autodiscover `<link rel=alternate type=...atom+xml|rss+xml>`
     in the HTML, resolve it, fetch, return/convert.
  3. Else probe a fixed list of common feed paths on the host.
  4. Else synthesise a feed from the page's heading/article/list
     links (deduped, same-origin, capped) --- labelled as scraped.
  5. Else fail with a clear reason.

Determinism note: nothing here reads the wall clock. Atom is
passed through verbatim; RSS dates are parsed from the source and
reformatted; entries with no parseable date get a fixed sentinel
(1970-01-01T00:00:00Z). Identical input -> identical bytes.

Running the doctests
--------------------

    doctest-lhs atomize.lhs

Module header and imports
-------------------------

> {-# LANGUAGE OverloadedStrings #-}
> module Main (main) where
>
> import           Control.Exception   (SomeException, try)
> import           Data.Char           (isDigit)
> import           Data.List           (find)
> import           Data.Maybe          (fromMaybe, listToMaybe, mapMaybe)
> import qualified Data.Text           as T
> import qualified Data.Text.IO        as TIO
> import           Data.Time           (UTCTime, defaultTimeLocale, formatTime,
>                                       parseTimeM)
> import           Network.URI         (parseURI, parseURIReference, relativeTo,
>                                       uriAuthority, uriRegName, uriScheme,
>                                       uriToString, escapeURIString,
>                                       isUnreserved, unEscapeString)
> import           System.Environment  (getArgs, lookupEnv)
> import           System.Exit         (ExitCode (..))
> import           System.IO           (BufferMode (..), hFlush, hSetBuffering,
>                                       hSetEncoding, stdout, utf8)
> import           System.Process      (readProcessWithExitCode)
> import           Text.HTML.TagSoup   (Tag (..), parseTags)

Defaults
--------

Host/port overridable via `GOPHER_HOST` / `GOPHER_PORT` so the same
script serves staging and production unedited.

> defaultHost, defaultPort :: T.Text
> defaultHost = "gopher.someodd.zip"
> defaultPort = "70"
>
> -- | Atom requires a timestamp on the feed and every entry; when the
> -- source has none we use this fixed value rather than the wall
> -- clock, keeping output deterministic.
> sentinelDate :: T.Text
> sentinelDate = "1970-01-01T00:00:00Z"
>
> -- | Hard cap on synthesised entries, so a huge link list can't
> -- produce a runaway feed.
> maxEntries :: Int
> maxEntries = 50
>
> -- | Tried in order when no feed is declared. Resolved against the
> -- target's scheme://host. First one that sniffs as a feed wins.
> probePaths :: [T.Text]
> probePaths =
>   [ "/feed", "/feed/", "/rss", "/rss.xml", "/atom.xml"
>   , "/feed.xml", "/index.xml", "/feeds/posts/default", "/?feed=rss2" ]

Request parsing
---------------

Four positional argv from Venusia (`$selector`, `$search`,
`$pathinfo`, `$remote_ip`); we ignore the IP via the cons pattern.

> data Req = Req
>   { reqSel :: T.Text
>   , reqQ   :: T.Text
>   , reqP   :: T.Text
>   } deriving Show
>
> parseArgs :: [String] -> Req
> parseArgs (s:q:p:_) = Req (T.pack s) (T.pack q) (T.pack p)
> parseArgs (s:q:_)   = Req (T.pack s) (T.pack q) T.empty
> parseArgs [s]       = Req (T.pack s) T.empty    T.empty
> parseArgs []        = error
>   "atomize.lhs: missing argv[0] (gopher selector). Run via Venusia, \
>   \or for manual testing pass selector + query/pathinfo explicitly."
>
> data Ctx = Ctx
>   { ctxScriptSel :: T.Text
>   , ctxHost      :: T.Text
>   , ctxPort      :: T.Text
>   }

Main dispatch
-------------

`stdout` is `NoBuffering` per house convention; the body is wrapped
in `try` so any uncaught exception becomes a visible type-3 row
(report path) rather than a blank response. The raw path's failures
are handled inside `emitRaw` as an error-feed, so they never reach
this catch.

> main :: IO ()
> main = do
>   hSetBuffering stdout NoBuffering
>   hSetEncoding  stdout utf8
>   result <- try mainBody :: IO (Either SomeException ())
>   case result of
>     Right () -> pure ()
>     Left e   -> mapM_ putLine
>       [ errorItem ("atomize.lhs crashed: " <> T.pack (show e))
>       , terminator
>       ]
>
> mainBody :: IO ()
> mainBody = do
>   req  <- parseArgs <$> getArgs
>   host <- maybe defaultHost T.pack <$> lookupEnv "GOPHER_HOST"
>   port <- maybe defaultPort T.pack <$> lookupEnv "GOPHER_PORT"
>   let pinfo     = reqP req
>       scriptSel = T.dropEnd (T.length pinfo) (reqSel req)
>       ctx       = Ctx scriptSel host port
>       q         = T.strip (reqQ req)
>       -- The page URL rides in path-info as ONE opaque string; we
>       -- never split it on '/', so a URL's own slashes survive.
>       pTarget   = T.dropWhile (== '/') pinfo
>   if not (T.null pTarget)
>     then emitRaw    (normalizeTargetUrl (T.pack (unEscapeString (T.unpack pTarget))))
>     else if T.null q
>       then emitLanding ctx
>       else emitReport  ctx (normalizeTargetUrl q)

URL normalisation
-----------------

> -- | Trim, repair a single-slashed scheme, and default a scheme-less
> -- input to https. Decoding from path-info happens before this.
> --
> -- >>> normalizeTargetUrl "  example.com/blog "
> -- "https://example.com/blog"
> --
> -- >>> normalizeTargetUrl "http://example.com"
> -- "http://example.com"
> --
> -- >>> normalizeTargetUrl "https:/example.com"
> -- "https://example.com"
> normalizeTargetUrl :: T.Text -> T.Text
> normalizeTargetUrl raw =
>   let s = repairScheme (T.strip raw)
>   in if hasScheme s then s else "https://" <> s
>
> -- | >>> hasScheme "http://x"
> -- True
> --
> -- >>> hasScheme "x.com"
> -- False
> hasScheme :: T.Text -> Bool
> hasScheme s = let l = T.toLower s
>               in "http://" `T.isPrefixOf` l || "https://" `T.isPrefixOf` l
>
> -- | Repair a scheme whose `//` got collapsed to `/` (happens when a
> -- URL is typed unencoded into path-info and empty segments drop).
> --
> -- >>> repairScheme "http:/example.com"
> -- "http://example.com"
> --
> -- >>> repairScheme "https://ok.com"
> -- "https://ok.com"
> repairScheme :: T.Text -> T.Text
> repairScheme s
>   | "https://" `T.isPrefixOf` l = s
>   | "http://"  `T.isPrefixOf` l = s
>   | "https:/"  `T.isPrefixOf` l = "https://" <> T.drop 7 s
>   | "http:/"   `T.isPrefixOf` l = "http://"  <> T.drop 6 s
>   | otherwise                   = s
>   where l = T.toLower s

SSRF guard
----------

> -- | The host component of a (normalised) URL, lower-cased.
> --
> -- >>> hostOf "https://Example.com/x"
> -- Just "example.com"
> --
> -- >>> hostOf "not a url"
> -- Nothing
> hostOf :: T.Text -> Maybe T.Text
> hostOf u = do
>   uri  <- parseURI (T.unpack (normalizeTargetUrl u))
>   auth <- uriAuthority uri
>   let h = T.toLower (T.pack (uriRegName auth))
>   if T.null h then Nothing else Just h
>
> -- | True for hosts we refuse to fetch: localhost, RFC-1918, CGNAT,
> -- link-local, the cloud-metadata IP, and `.local` mDNS names.
> --
> -- >>> isBlockedHost "localhost"
> -- True
> --
> -- >>> isBlockedHost "192.168.1.10"
> -- True
> --
> -- >>> isBlockedHost "169.254.169.254"
> -- True
> --
> -- >>> isBlockedHost "example.com"
> -- False
> isBlockedHost :: T.Text -> Bool
> isBlockedHost h0 =
>   let h = T.toLower h0
>   in  h `elem` ["", "localhost", "0.0.0.0", "::1", "[::1]", "metadata"]
>    || ".local" `T.isSuffixOf` h
>    || "127."     `T.isPrefixOf` h
>    || "10."      `T.isPrefixOf` h
>    || "192.168." `T.isPrefixOf` h
>    || "169.254." `T.isPrefixOf` h
>    || "100.64."  `T.isPrefixOf` h
>    || in172 h
>   where
>     -- 172.16.0.0 .. 172.31.255.255
>     in172 h = case T.splitOn "." h of
>       (a:b:_) | a == "172"
>               , all isDigit (T.unpack b)
>               , not (T.null b)
>               , let n = read (T.unpack b) :: Int
>               -> n >= 16 && n <= 31
>       _       -> False

Fetching
--------

curl with a tight envelope: http/https only, capped redirects, time,
and response size. A non-zero exit (including `--max-filesize`
overflow) yields `Nothing`.

> fetchUrl :: T.Text -> IO (Maybe T.Text)
> fetchUrl url = case hostOf url of
>   Just h | not (isBlockedHost h) -> do
>     (ec, out, _) <- readProcessWithExitCode "curl"
>       [ "-sSL", "--proto", "=http,https", "--max-redirs", "3"
>       , "--connect-timeout", "5", "--max-time", "15"
>       , "--max-filesize", "5000000"
>       , "-A", "atomize.lhs (gopher feed maker)"
>       , "--", T.unpack url ] ""
>     pure $ case ec of
>       ExitSuccess | not (null out) -> Just (T.pack out)
>       _                            -> Nothing
>   _ -> pure Nothing

Document sniffing
-----------------

Which kind of document is this? We find the first of the recognised
root tokens to appear and key off that, so a stray `<feed>` mention
deep in an HTML body doesn't masquerade as an Atom root.

> data DocKind = KAtom | KRss | KRdf | KHtml deriving (Eq, Show)
>
> -- | >>> sniffDoc "<?xml version='1.0'?><feed xmlns='...'>"
> -- KAtom
> --
> -- >>> sniffDoc "<rss version=\"2.0\"><channel>"
> -- KRss
> --
> -- >>> sniffDoc "<!DOCTYPE html><html><head>"
> -- KHtml
> --
> -- >>> sniffDoc "<rdf:RDF xmlns='...'>"
> -- KRdf
> sniffDoc :: T.Text -> DocKind
> sniffDoc t =
>   let l         = T.toLower t
>       posOf pat = let (pre, m) = T.breakOn pat l
>                   in if T.null m then Nothing else Just (T.length pre)
>       cands     = [ (KAtom, "<feed"), (KRss, "<rss"), (KRdf, "<rdf")
>                   , (KHtml, "<html"), (KHtml, "<!doctype html") ]
>       hits      = [ (k, p) | (k, pat) <- cands, Just p <- [posOf pat] ]
>   in case hits of
>        [] -> KHtml
>        _  -> fst (minimumByPos hits)
>   where minimumByPos = foldr1 (\a b -> if snd a <= snd b then a else b)

Tag-soup helpers
----------------

Lenient lookups over a flat `[Tag Text]`, shared by discovery,
conversion, and scraping.

> lc :: T.Text -> T.Text
> lc = T.toLower
>
> isOpen, isClose :: T.Text -> Tag T.Text -> Bool
> isOpen  n (TagOpen t _) = lc t == n
> isOpen  _ _             = False
> isClose n (TagClose t)  = lc t == n
> isClose _ _             = False
>
> -- | Attribute value by (case-insensitive) name; "" if absent.
> attrOf :: T.Text -> [(T.Text, T.Text)] -> T.Text
> attrOf k attrs = fromMaybe "" (lookup (lc k) [ (lc a, b) | (a, b) <- attrs ])
>
> -- | Concatenated text directly inside the first `<name>...</name>`.
> innerText :: T.Text -> [Tag T.Text] -> T.Text
> innerText name tags =
>   case dropWhile (not . isOpen name) tags of
>     (_:rest) -> T.strip . T.concat $
>                   [ s | TagText s <- takeWhile (not . isClose name) rest ]
>     []       -> ""
>
> -- | All `<name>...</name>` chunks in document order (non-nested).
> chunksOf :: T.Text -> [Tag T.Text] -> [[Tag T.Text]]
> chunksOf name = go
>   where
>     go ts = case dropWhile (not . isOpen name) ts of
>       []       -> []
>       (_:rest) -> takeWhile (not . isClose name) rest
>                     : go (dropWhile (not . isClose name) rest)

Feed autodiscovery
------------------

> -- | Declared alternate feeds as (kind, raw-href), Atom-typed first.
> discoverFeedLinks :: [Tag T.Text] -> [(DocKind, T.Text)]
> discoverFeedLinks tags =
>   let links = [ (ty, attrOf "href" attrs)
>               | TagOpen n attrs <- tags, lc n == "link"
>               , let ty = lc (attrOf "type" attrs)
>               , ty == "application/atom+xml" || ty == "application/rss+xml"
>               , not (T.null (attrOf "href" attrs)) ]
>       atoms = [ (KAtom, h) | (ty, h) <- links, ty == "application/atom+xml" ]
>       rsses = [ (KRss,  h) | (ty, h) <- links, ty == "application/rss+xml" ]
>   in atoms ++ rsses
>
> -- | Resolve a possibly-relative href against a base URL.
> --
> -- >>> resolveUrl "https://a.com/blog/" "feed.xml"
> -- "https://a.com/blog/feed.xml"
> --
> -- >>> resolveUrl "https://a.com/blog/" "/atom"
> -- "https://a.com/atom"
> resolveUrl :: T.Text -> T.Text -> T.Text
> resolveUrl base href =
>   case (parseURIReference (T.unpack href), parseURI (T.unpack base)) of
>     (Just r, Just b) -> T.pack (uriToString id (r `relativeTo` b) "")
>     _                -> href

RSS / RDF -> Atom conversion
----------------------------

We take the first `<title>`/`<link>` as the channel's, then map each
`<item>` to an `<entry>`. Dates are parsed from the source and
reformatted to RFC-3339; unparseable ones fall back to the sentinel.

> rssToAtom :: T.Text -> T.Text -> T.Text
> rssToAtom srcUrl body =
>   let tags     = parseTags body
>       chTitle  = firstNonEmpty [innerText "title" tags, "Untitled feed"]
>       chLink   = firstNonEmpty [innerText "link" tags, srcUrl]
>       items    = chunksOf "item" tags
>       entries  = take maxEntries (map (itemToEntry srcUrl) items)
>       fUpdated = firstNonEmpty (map enUpdated entries ++ [sentinelDate])
>       meta     = Meta chTitle chLink chLink fUpdated
>   in buildAtom meta entries
>
> itemToEntry :: T.Text -> [Tag T.Text] -> Entry
> itemToEntry srcUrl chunk =
>   let title = firstNonEmpty [innerText "title" chunk, "(untitled)"]
>       link  = firstNonEmpty [innerText "link" chunk, innerText "guid" chunk]
>       gid   = firstNonEmpty [innerText "guid" chunk, link, srcUrl]
>       date  = toAtomDate (firstNonEmpty
>                 [innerText "pubdate" chunk, innerText "dc:date" chunk])
>       desc  = innerText "description" chunk
>   in Entry title link gid date (if T.null desc then Nothing else Just desc)

Synthesising from page links
-----------------------------

When nothing is declared and nothing is at a common path, we build
entries from anchors that sit inside heading / article / list-item
context (which filters most nav/footer chrome), deduped by resolved
href and kept same-origin.

> scrapeEntries :: T.Text -> T.Text -> [Entry]
> scrapeEntries base body =
>   let baseHost = fromMaybe "" (hostOf base)
>       raw      = go 0 Nothing (parseTags body)
>       resolved = [ (resolveUrl base h, T.unwords (T.words txt))
>                  | (h, txt) <- raw, not (T.null h), not (T.null (T.strip txt)) ]
>       sameHost = [ (h, txt) | (h, txt) <- resolved
>                  , hostOf h == Just baseHost ]
>       deduped  = dedupeByFst sameHost
>   in take maxEntries
>        [ Entry txt h h sentinelDate Nothing | (h, txt) <- deduped ]
>   where
>     content = ["h1", "h2", "h3", "article", "li"]
>     -- depth = how many content containers are currently open;
>     -- cur   = (href, text-so-far) while inside a captured anchor.
>     go :: Int -> Maybe (T.Text, T.Text) -> [Tag T.Text]
>        -> [(T.Text, T.Text)]
>     go _ _ [] = []
>     go depth cur (tag:rest) = case tag of
>       TagOpen n attrs
>         | lc n == "a" && depth > 0 && cur == Nothing ->
>             go depth (Just (attrOf "href" attrs, "")) rest
>         | lc n `elem` content -> go (depth + 1) cur rest
>         | otherwise           -> go depth cur rest
>       TagText s -> case cur of
>         Just (h, acc) -> go depth (Just (h, acc <> s)) rest
>         Nothing       -> go depth cur rest
>       TagClose n
>         | lc n == "a" -> case cur of
>             Just (h, acc) -> (h, acc) : go depth Nothing rest
>             Nothing       -> go depth cur rest
>         | lc n `elem` content -> go (max 0 (depth - 1)) cur rest
>         | otherwise           -> go depth cur rest
>       _ -> go depth cur rest

Atom emission
-------------

We build the XML as text by hand --- we control every field, so a
serialiser would be ceremony. Every interpolated value is escaped.

> data Meta = Meta
>   { fmTitle :: T.Text, fmId :: T.Text, fmLink :: T.Text, fmUpdated :: T.Text }
>
> data Entry = Entry
>   { enTitle   :: T.Text
>   , enLink    :: T.Text
>   , enId      :: T.Text
>   , enUpdated :: T.Text
>   , enSummary :: Maybe T.Text
>   }
>
> buildAtom :: Meta -> [Entry] -> T.Text
> buildAtom meta entries = T.concat $
>   [ "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
>   , "<feed xmlns=\"http://www.w3.org/2005/Atom\">\n"
>   , el "title"   (fmTitle meta)
>   , el "id"      (nonEmptyId (fmId meta))
>   , el "updated" (fmUpdated meta)
>   , linkEl       (fmLink meta)
>   , "  <generator>atomize.lhs</generator>\n"
>   ] ++ map entryXml entries ++ [ "</feed>\n" ]
>   where
>     el name v = "  <" <> name <> ">" <> xmlEscape v <> "</" <> name <> ">\n"
>     linkEl h
>       | T.null h  = ""
>       | otherwise = "  <link rel=\"alternate\" href=\"" <> xmlEscape h <> "\"/>\n"
>     entryXml e = T.concat
>       [ "  <entry>\n  "
>       , el "title"   (enTitle e)
>       , "  ", el "id" (nonEmptyId (firstNonEmpty [enId e, enLink e]))
>       , "  ", el "updated" (enUpdated e)
>       , "  ", linkEl (enLink e)
>       , maybe "" (\s -> "  " <> el "summary" s) (enSummary e)
>       , "  </entry>\n" ]
>     -- Atom <id> must be an IRI; fall back to a tag: URI when blank.
>     nonEmptyId i = if T.null i then "tag:atomize.lhs,1970:empty" else i
>
> -- | Escape the five XML-significant characters.
> --
> -- >>> xmlEscape "a<b>&\"'"
> -- "a&lt;b&gt;&amp;&quot;&apos;"
> xmlEscape :: T.Text -> T.Text
> xmlEscape = T.concatMap esc
>   where
>     esc '<'  = "&lt;"
>     esc '>'  = "&gt;"
>     esc '&'  = "&amp;"
>     esc '"'  = "&quot;"
>     esc '\'' = "&apos;"
>     esc c    = T.singleton c

Date handling
-------------

> -- | Reformat a source date to RFC-3339, or the sentinel if no known
> -- format parses. Pure (parses the given text; never reads the
> -- clock), so output stays deterministic.
> --
> -- >>> toAtomDate "Mon, 06 Sep 2021 16:45:00 +0000"
> -- "2021-09-06T16:45:00Z"
> --
> -- >>> toAtomDate "2021-09-06T16:45:00Z"
> -- "2021-09-06T16:45:00Z"
> --
> -- >>> toAtomDate "not a date"
> -- "1970-01-01T00:00:00Z"
> toAtomDate :: T.Text -> T.Text
> toAtomDate s
>   | T.null (T.strip s) = sentinelDate
>   | otherwise = case parsed of
>       Just u  -> T.pack (formatTime defaultTimeLocale "%Y-%m-%dT%H:%M:%SZ" u)
>       Nothing -> sentinelDate
>   where
>     str    = T.unpack (T.strip s)
>     parsed = listToMaybe $ mapMaybe tryFmt fmts
>     tryFmt f = parseTimeM True defaultTimeLocale f str :: Maybe UTCTime
>     fmts =
>       [ "%a, %d %b %Y %H:%M:%S %z"
>       , "%a, %d %b %Y %H:%M:%S %Z"
>       , "%Y-%m-%dT%H:%M:%S%Z"
>       , "%Y-%m-%dT%H:%M:%S%z"
>       , "%Y-%m-%dT%H:%M:%SZ"
>       , "%Y-%m-%d" ]

The deriving step (IO)
----------------------

Walks the ladder and reports both the resulting feed (if any) and a
human note describing which rung produced it.

> data Outcome = Outcome
>   { ocNote :: T.Text          -- ^ how it went, for the report page
>   , ocFeed :: Maybe T.Text    -- ^ the Atom feed on success
>   }
>
> deriveFeed :: T.Text -> IO Outcome
> deriveFeed url = case hostOf url of
>   Nothing -> pure (Outcome "could not parse that as an http(s) URL" Nothing)
>   Just h | isBlockedHost h ->
>     pure (Outcome ("refused: \"" <> h <> "\" is a local/private address") Nothing)
>   Just _ -> do
>     mBody <- fetchUrl url
>     case mBody of
>       Nothing   -> pure (Outcome "could not fetch the page (timeout, too big, or error)" Nothing)
>       Just body -> case sniffDoc body of
>         KAtom -> pure (Outcome "the URL is already an Atom feed (returned as-is)"
>                                (Just body))
>         KRss  -> pure (Outcome "the URL is an RSS feed (converted to Atom)"
>                                (Just (rssToAtom url body)))
>         KRdf  -> pure (Outcome "the URL is an RDF/RSS feed (converted to Atom)"
>                                (Just (rssToAtom url body)))
>         KHtml -> fromHtml url body
>
> -- | Tiers 2-4 for an HTML page.
> fromHtml :: T.Text -> T.Text -> IO Outcome
> fromHtml url body = do
>   let tags = parseTags body
>   declared <- tryDeclared url (discoverFeedLinks tags)
>   case declared of
>     Just o  -> pure o
>     Nothing -> do
>       probed <- tryProbe url
>       case probed of
>         Just o  -> pure o
>         Nothing -> case scrapeEntries url body of
>           []      -> pure (Outcome
>             "no declared feed, nothing at common feed paths, and no \
>             \heading/list link structure to synthesise from" Nothing)
>           entries ->
>             let title = firstNonEmpty [innerText "title" tags, url]
>                 meta  = Meta title url url
>                           (firstNonEmpty (map enUpdated entries ++ [sentinelDate]))
>                 note  = "no feed declared; synthesised "
>                          <> T.pack (show (length entries))
>                          <> " entries from page links (may be noisy)"
>             in pure (Outcome note (Just (buildAtom meta entries)))
>
> -- | Fetch the first declared feed that resolves and parses.
> tryDeclared :: T.Text -> [(DocKind, T.Text)] -> IO (Maybe Outcome)
> tryDeclared _    []                 = pure Nothing
> tryDeclared base ((kind, href):more) = do
>   let abs' = resolveUrl base href
>   mBody <- fetchUrl abs'
>   case mBody of
>     Nothing   -> tryDeclared base more
>     Just body -> case sniffDoc body of
>       KAtom -> pure (Just (Outcome ("found a declared Atom feed: " <> abs') (Just body)))
>       KRss  -> pure (Just (Outcome ("found a declared RSS feed (converted): " <> abs')
>                                    (Just (rssToAtom abs' body))))
>       KRdf  -> pure (Just (Outcome ("found a declared RDF feed (converted): " <> abs')
>                                    (Just (rssToAtom abs' body))))
>       KHtml | kind == KAtom -> pure (Just (Outcome
>                 ("declared Atom feed returned non-feed content: " <> abs') Nothing))
>             | otherwise     -> tryDeclared base more
>
> -- | Probe the fixed common-path list on the target's host.
> tryProbe :: T.Text -> IO (Maybe Outcome)
> tryProbe url = case rootOf url of
>   Nothing   -> pure Nothing
>   Just root -> go probePaths
>     where
>       go [] = pure Nothing
>       go (p:ps) = do
>         let cand = root <> p
>         mBody <- fetchUrl cand
>         case mBody >>= \b -> Just (sniffDoc b, b) of
>           Just (KAtom, b) -> pure (Just (Outcome ("found a feed by probing " <> p) (Just b)))
>           Just (KRss,  b) -> pure (Just (Outcome ("found an RSS feed by probing " <> p <> " (converted)")
>                                                  (Just (rssToAtom cand b))))
>           Just (KRdf,  b) -> pure (Just (Outcome ("found an RDF feed by probing " <> p <> " (converted)")
>                                                  (Just (rssToAtom cand b))))
>           _               -> go ps
>
> -- | scheme://host of a URL (no path), for probing.
> --
> -- >>> rootOf "https://a.com/blog/post?x=1"
> -- Just "https://a.com"
> rootOf :: T.Text -> Maybe T.Text
> rootOf url = do
>   uri  <- parseURI (T.unpack (normalizeTargetUrl url))
>   auth <- uriAuthority uri
>   -- uriScheme keeps its trailing colon, e.g. "https:"
>   pure (T.pack (uriScheme uri) <> "//" <> T.pack (uriRegName auth))

Raw endpoint
------------

The hero path: emit the Atom feed as raw bytes, no gopher
terminator. On failure, still emit a valid one-entry Atom feed whose
entry carries the reason --- so a reader/`xmllint` always parses a
real feed.

> emitRaw :: T.Text -> IO ()
> emitRaw url = do
>   Outcome note mFeed <- deriveFeed url
>   case mFeed of
>     Just feed -> TIO.putStr feed
>     Nothing   -> TIO.putStr (errorFeed url note)
>
> errorFeed :: T.Text -> T.Text -> T.Text
> errorFeed url note = buildAtom
>   (Meta ("atomize: could not build a feed for " <> url)
>         ("tag:atomize.lhs,1970:error") url sentinelDate)
>   [ Entry ("Could not build an Atom feed for " <> url) url
>           ("tag:atomize.lhs,1970:error") sentinelDate (Just note) ]

Report page (type-7 result -> a menu)
-------------------------------------

> emitReport :: Ctx -> T.Text -> IO ()
> emitReport ctx url = do
>   Outcome note mFeed <- deriveFeed url
>   putLine (infoLine "atomize -- turn a web page into an Atom feed.")
>   putLine (infoLine ("Target: " <> url))
>   putLine (infoLine "")
>   case mFeed of
>     Just _ -> do
>       let enc      = percentEncodeUrl url
>           rawSel   = ctxScriptSel ctx <> "/" <> enc
>           portPart = if ctxPort ctx == "70" then "" else ":" <> ctxPort ctx
>           rawUri   = "gopher://" <> ctxHost ctx <> portPart
>                        <> "/0" <> rawSel
>       putLine (infoLine note)
>       putLine (infoLine "")
>       putLine (menuRow '0' "Atom feed -- open or save" rawSel
>                  (ctxHost ctx) (ctxPort ctx))
>       putLine (infoLine "One-shot from a shell:")
>       putLine (infoLine ("  curl " <> rawUri <> " > feed.xml"))
>     Nothing -> do
>       putLine (errorItem ("Failed: " <> note))
>       putLine (infoLine "")
>       putLine (infoLine "Try a more specific page (a blog index, a")
>       putLine (infoLine "category page) or a known feed URL directly.")
>   mapM_ putLine (footerRows ctx)
>   putLine terminator

Landing page
------------

> emitLanding :: Ctx -> IO ()
> emitLanding ctx = do
>   let portPart = if ctxPort ctx == "70" then "" else ":" <> ctxPort ctx
>       base     = "gopher://" <> ctxHost ctx <> portPart <> "/0" <> ctxScriptSel ctx
>   mapM_ putLine
>     [ infoLine "atomize -- turn any web page into an Atom feed."
>     , infoLine "Deterministic: discover a declared feed, convert RSS,"
>     , infoLine "probe common paths, or synthesise from page links."
>     , infoLine ""
>     , searchLine "Paste a web-page URL" (ctxScriptSel ctx) (ctxHost ctx) (ctxPort ctx)
>     , infoLine ""
>     , infoLine "Or go straight to the raw feed from a shell:"
>     , infoLine ("  curl " <> base <> "/example.com/blog")
>     , infoLine "(the URL is the selector; bare host/path defaults to https)"
>     ]
>   mapM_ putLine (footerRows ctx)
>   putLine terminator
>
> footerRows :: Ctx -> [T.Text]
> footerRows ctx =
>   [ infoLine ""
>   , menuRow '1' "-- served over gopher * no js * no ads * no tracking --"
>             "" (ctxHost ctx) (ctxPort ctx) ]

Selector encoding
-----------------

The page URL becomes one opaque path-info segment: escape everything
but the URI-unreserved set, so its slashes and colon survive intact.

> -- | >>> percentEncodeUrl "https://a.com/b?x=1"
> -- "https%3A%2F%2Fa.com%2Fb%3Fx%3D1"
> percentEncodeUrl :: T.Text -> T.Text
> percentEncodeUrl = T.pack . escapeURIString isUnreserved . T.unpack

Gophermap line builders + sanitisation
--------------------------------------

> putLine :: T.Text -> IO ()
> putLine t = TIO.putStr (t <> "\r\n") >> hFlush stdout
>
> -- | >>> infoLine "hello"
> -- "ihello\t\t\t0"
> infoLine :: T.Text -> T.Text
> infoLine msg = "i" <> sanitize msg <> "\t\t\t0"
>
> -- | >>> menuRow '1' "home" "/" "host" "70"
> -- "1home\t/\thost\t70"
> menuRow :: Char -> T.Text -> T.Text -> T.Text -> T.Text -> T.Text
> menuRow t display selector host port =
>   T.singleton t <> sanitize display <> "\t" <> selector
>     <> "\t" <> host <> "\t" <> port
>
> -- | >>> searchLine "Search" "/q" "host" "70"
> -- "7Search\t/q\thost\t70"
> searchLine :: T.Text -> T.Text -> T.Text -> T.Text -> T.Text
> searchLine display sel host port =
>   "7" <> sanitize display <> "\t" <> sel <> "\t" <> host <> "\t" <> port
>
> -- | >>> errorItem "nope"
> -- "3nope\t\t\t0"
> errorItem :: T.Text -> T.Text
> errorItem msg = "3" <> sanitize msg <> "\t\t\t0"
>
> terminator :: T.Text
> terminator = "."
>
> -- | >>> sanitize "a\tb\nc"
> -- "a b c"
> sanitize :: T.Text -> T.Text
> sanitize = T.map (\c -> if c == '\r' || c == '\n' || c == '\t' then ' ' else c)

Small pure utilities
--------------------

> -- | First non-empty (after trimming) text, or "" if all blank.
> --
> -- >>> firstNonEmpty ["", "  ", "x", "y"]
> -- "x"
> firstNonEmpty :: [T.Text] -> T.Text
> firstNonEmpty = fromMaybe "" . find (not . T.null) . map T.strip
>
> -- | Keep the first occurrence of each first-component, order-stable.
> --
> -- >>> dedupeByFst [("a",1),("b",2),("a",3)]
> -- [("a",1),("b",2)]
> dedupeByFst :: Eq a => [(a, b)] -> [(a, b)]
> dedupeByFst xs = go [] xs
>   where
>     go _    []          = []
>     go seen ((k,v):rest)
>       | k `elem` seen = go seen rest
>       | otherwise     = (k,v) : go (k:seen) rest