codemadness.org/1/git/randomcrap/file/config/sfeed/filter.gph

       filter - randomcrap - random crap programs of varying quality
  HTML git clone git://git.codemadness.org/randomcrap
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       filter (4740B)
       ---
            1 LC_ALL=C
            2 AWK="awk"
            3 
            4 # filter fields.
            5 # filter(name)
            6 filter() {
            7         case "$1" in
            8         "anandtech")
            9                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           10                 # shorten link.
           11                 {
           12                         if (match($3, /^https:\/\/www\.anandtech\.com\/show\/[0-9]+\//)) {
           13                                 $3 = substr($3, RSTART, RLENGTH);
           14                         }
           15                         print $0;
           16                 }';;
           17         "b.net sc2")
           18                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           19                 # fix urls in battle.net feed.
           20                 {
           21                         # http://us.battle.net/en/blog/22882717
           22                         sub(/^(http|https):\/\/us\.battle\.net\/en\/blog\//, "https://starcraft2.com/en-us/news/", $3);
           23                         print $0;
           24                 }';;
           25         "bitreich news")
           26                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           27                 # <content> attribute is wrong, force to HTML.
           28                 {
           29                         #$5 = "html";
           30                         print $0;
           31                 }';;
           32         "dagblad noorden")
           33                 # this feed sometimes has no title, use the description then.
           34                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           35                 {
           36                         if (length($2) == 0) {
           37                                 title = $4;
           38                                 # strip tags.
           39                                 gsub(/<[^>*]>/, "", title);
           40                                 # trim whitespace.
           41                                 gsub(/^[ ]*/, "", title);
           42                                 gsub(/[ ]*$/, "", title);
           43                                 $2 = title;
           44                         }
           45                         print $0;
           46                 }';;
           47         "yt geenstijl")
           48                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           49                 $4 !~ /brante@immink/ { print $0 } # skip podcasts of Brante & Immink.
           50                 ';;
           51         "hardware.info")
           52                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           53                 # skip ads.
           54                 $2 ~ /^Advertorial:/ {
           55                         next;
           56                 }
           57                 # skip sponsored biased content.
           58                 $2 ~ /^Sponsored:/ {
           59                         next;
           60                 }
           61                 # fixup HTTP to HTTPS link, shorten link.
           62                 {
           63                         sub(/^http:\/\/nl\.hardware\.info/, "https://nl.hardware.info", $3);
           64                         if (match($3, /^https:\/\/nl\.hardware\.info\/nieuws\/[0-9]+\//)) {
           65                                 $3 = substr($3, RSTART, RLENGTH);
           66                         }
           67                         print $0;
           68                 }';;
           69         "ifixit blog")
           70                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           71                 # shorten link.
           72                 {
           73                         if (match($3, /^https:\/\/ifixit\.org\/blog\/[0-9]+\//)) {
           74                                 $3 = substr($3, RSTART, RLENGTH);
           75                         }
           76                         print $0;
           77                 }';;
           78         nist*)
           79                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           80                 {
           81                         # add content to title.
           82                         $2 = $2 " " $4;
           83                         print $0
           84                 }';;
           85         nu.nl*)
           86                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           87                 # shorten link.
           88                 {
           89                         if (match($3, /^https:\/\/www\.nu\.nl\/[a-z]+\/[0-9]+\//)) {
           90                                 $3 = substr($3, RSTART, RLENGTH);
           91                         }
           92                         print $0;
           93                 }';;
           94         "osnews")
           95                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
           96                 # shorten link.
           97                 {
           98                         if (match($3, /^(http|https):\/\/osnews\.com\/story\/[0-9]+\//)) {
           99                                 $3 = substr($3, RSTART, RLENGTH);
          100                         }
          101                         print $0;
          102                 }';;
          103         reddit*)
          104                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
          105                 # shorten link.
          106                 {
          107                         if (match($3, /^https:\/\/old.reddit.com\/r\/[^\/]*\/comments\/[^\/]*\//)) {
          108                                 $3 = substr($3, RSTART, RLENGTH);
          109                         }
          110                         print $0;
          111                 }';;
          112         "rtvdrenthe")
          113                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
          114                 # shorten link.
          115                 {
          116                         if (match($3, /^https:\/\/www\.rtvdrenthe\.nl\/nieuws\/[0-9]+\//)) {
          117                                 $3 = substr($3, RSTART, RLENGTH);
          118                         }
          119                         print $0;
          120                 }';;
          121         "rtvnoord")
          122                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
          123                 # shorten link.
          124                 {
          125                         if (match($3, /^https:\/\/www\.rtvnoord\.nl\/nieuws\/[0-9]+\//)) {
          126                                 $3 = substr($3, RSTART, RLENGTH) "a";
          127                         }
          128                         print $0;
          129                 }';;
          130         techpowerup*)
          131                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
          132                 # shorten link.
          133                 {
          134                         if (match($3, /^https:\/\/www\.techpowerup\.com\/[0-9]+\//)) {
          135                                 $3 = substr($3, RSTART, RLENGTH);
          136                         }
          137                         print $0;
          138                 }';;
          139         "tweakers")
          140                 $AWK -F '\t' 'BEGIN { OFS = "\t"; }
          141                 # skip ads.
          142                 $2 ~ /^ADV:/ {
          143                         next;
          144                 }
          145                 # shorten link.
          146                 {
          147                         if (match($3, /^https:\/\/tweakers\.net\/[a-z]+\/[0-9]+\//)) {
          148                                 $3 = substr($3, RSTART, RLENGTH);
          149                         }
          150                         print $0;
          151                 }';;
          152         "yt BSDNow")
          153                 # filter only BSD Now from channel.
          154                 $AWK -F '\t' '$2 ~ / \| BSD Now/';;
          155         *)
          156                 cat;;
          157         esac | \
          158                 # replace youtube links with embed links.
          159                 sed 's@www.youtube.com/watch?v=@www.youtube.com/embed/@g' | \
          160 
          161                 LC_ALL=C $AWK -F '\t' 'BEGIN { OFS = "\t"; }
          162                 function filterlink(s) {
          163                         # protocol must start with http, https or gopher.
          164                         if (match(s, /^(http|https|gopher):\/\//) == 0) {
          165                                 return "";
          166                         }
          167 
          168                         # shorten feedburner links.
          169                         if (match(s, /^(http|https):\/\/[^\/]+\/~r\/.*\/~3\/[^\/]+\//)) {
          170                                 s = substr($3, RSTART, RLENGTH);
          171                         }
          172 
          173                         # strip tracking parameters
          174                         # urchin, facebook, piwik, webtrekk and generic.
          175                         gsub(/\?(ad|campaign|pk|tm|wt|fbclid|utm)_([^&]+)/, "?", s);
          176                         gsub(/&(ad|campaign|pk|tm|wt|fbclid|utm)_([^&]+)/, "", s);
          177 
          178                         gsub(/\?&/, "?", s);
          179                         gsub(/[\?&]+$/, "", s);
          180 
          181                         return s
          182                 }
          183                 function filtertitle(s) {
          184                         #gsub("&#821[67];", "'"'"'", s);
          185                         #gsub("<[/]?em>", "_", s);
          186                         #gsub("<[/]?b>", "*", s);
          187                         #gsub("<[/]?nobr>", "", s);
          188                         #gsub("<[/]?wbr>", "", s);
          189                         return s;
          190                 }
          191                 {
          192                         $2 = filtertitle($2); # title
          193 
          194                         $3 = filterlink($3); # link
          195                         $8 = filterlink($8); # enclosure
          196 
          197                         # try to remove tracking pixels: <img/> tags with 1px width or height.
          198                         gsub("<img[^>]*(width|height)[[:space:]]*=[[:space:]]*[\"'"'"' ]?1[\"'"'"' ]?[^0-9>]+[^>]*>", "", $4);
          199 
          200                         print $0;
          201                 }'
          202 }