filter - randomcrap - random crap programs of varying quality
HTML git clone git://git.codemadness.org/randomcrap
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
filter (4740B)
---
# Set the locale variable to "C" (plain assignment, not exported here).
LC_ALL=C
# awk implementation used by every filter below.
AWK="awk"

# filter(name)
#
# Filter TAB-separated feed records read from stdin and write the result to
# stdout. Fields touched by the filters below: $2 = title, $3 = link,
# $4 = content/description, $8 = enclosure.
#
# Stage 1: a per-feed filter selected by the feed name in "$1"; unknown names
#          pass the data through unchanged.
# Stage 2: rewrite youtube watch links to embed links.
# Stage 3: generic cleanup for all feeds: only allow http, https and gopher
#          links, shorten feedburner links, strip tracking parameters and
#          remove 1px tracking images from the content.
#
# Every awk program sets OFS to a TAB, because assigning to a field makes awk
# rebuild the record using OFS.
filter() {
	case "$1" in
	"anandtech")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# shorten link: keep only the part up to and including the numeric id.
{
	if (match($3, /^https:\/\/www\.anandtech\.com\/show\/[0-9]+\//)) {
		$3 = substr($3, RSTART, RLENGTH);
	}
	print $0;
}';;
	"b.net sc2")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# fix urls in battle.net feed.
{
	# example of an old link: http://us.battle.net/en/blog/22882717
	sub(/^(http|https):\/\/us\.battle\.net\/en\/blog\//, "https://starcraft2.com/en-us/news/", $3);
	print $0;
}';;
	"bitreich news")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# <content> attribute is wrong. Forcing it to HTML is currently disabled,
# so records pass through unchanged.
{
	#$5 = "html";
	print $0;
}';;
	"dagblad noorden")
		# this feed sometimes has no title, use the description then.
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
{
	if (length($2) == 0) {
		title = $4;
		# strip tags: "<", any run of non-">" characters, ">".
		gsub(/<[^>]*>/, "", title);
		# trim leading and trailing spaces.
		sub(/^[ ]+/, "", title);
		sub(/[ ]+$/, "", title);
		$2 = title;
	}
	print $0;
}';;
	"yt geenstijl")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
$4 !~ /brante@immink/ { print $0 } # skip podcasts of Brante & Immink.
';;
	"hardware.info")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# skip ads.
$2 ~ /^Advertorial:/ {
	next;
}
# skip sponsored biased content.
$2 ~ /^Sponsored:/ {
	next;
}
# fixup HTTP to HTTPS link, shorten link.
{
	sub(/^http:\/\/nl\.hardware\.info/, "https://nl.hardware.info", $3);
	if (match($3, /^https:\/\/nl\.hardware\.info\/nieuws\/[0-9]+\//)) {
		$3 = substr($3, RSTART, RLENGTH);
	}
	print $0;
}';;
	"ifixit blog")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# shorten link.
{
	if (match($3, /^https:\/\/ifixit\.org\/blog\/[0-9]+\//)) {
		$3 = substr($3, RSTART, RLENGTH);
	}
	print $0;
}';;
	nist*)
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
{
	# add content to title.
	$2 = $2 " " $4;
	print $0
}';;
	nu.nl*)
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# shorten link.
{
	if (match($3, /^https:\/\/www\.nu\.nl\/[a-z]+\/[0-9]+\//)) {
		$3 = substr($3, RSTART, RLENGTH);
	}
	print $0;
}';;
	"osnews")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# shorten link.
{
	if (match($3, /^(http|https):\/\/osnews\.com\/story\/[0-9]+\//)) {
		$3 = substr($3, RSTART, RLENGTH);
	}
	print $0;
}';;
	reddit*)
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# shorten link. The dots in the host name are escaped so they only match
# a literal dot.
{
	if (match($3, /^https:\/\/old\.reddit\.com\/r\/[^\/]*\/comments\/[^\/]*\//)) {
		$3 = substr($3, RSTART, RLENGTH);
	}
	print $0;
}';;
	"rtvdrenthe")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# shorten link.
{
	if (match($3, /^https:\/\/www\.rtvdrenthe\.nl\/nieuws\/[0-9]+\//)) {
		$3 = substr($3, RSTART, RLENGTH);
	}
	print $0;
}';;
	"rtvnoord")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# shorten link.
# NOTE(review): a literal "a" is appended after the shortened link;
# presumably the site requires a non-empty path component there - confirm.
{
	if (match($3, /^https:\/\/www\.rtvnoord\.nl\/nieuws\/[0-9]+\//)) {
		$3 = substr($3, RSTART, RLENGTH) "a";
	}
	print $0;
}';;
	techpowerup*)
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# shorten link.
{
	if (match($3, /^https:\/\/www\.techpowerup\.com\/[0-9]+\//)) {
		$3 = substr($3, RSTART, RLENGTH);
	}
	print $0;
}';;
	"tweakers")
		$AWK -F '\t' 'BEGIN { OFS = "\t"; }
# skip ads.
$2 ~ /^ADV:/ {
	next;
}
# shorten link.
{
	if (match($3, /^https:\/\/tweakers\.net\/[a-z]+\/[0-9]+\//)) {
		$3 = substr($3, RSTART, RLENGTH);
	}
	print $0;
}';;
	"yt BSDNow")
		# filter only BSD Now from channel.
		$AWK -F '\t' '$2 ~ / \| BSD Now/';;
	*)
		# no feed-specific filter: pass through unchanged.
		cat;;
	esac |
	# replace youtube links with embed links.
	sed 's@www\.youtube\.com/watch?v=@www.youtube.com/embed/@g' |
	LC_ALL=C $AWK -F '\t' 'BEGIN { OFS = "\t"; }
# filterlink(s): return the cleaned-up URL s, or "" when the protocol is
# not allowed.
function filterlink(s) {
	# protocol must start with http, https or gopher.
	if (match(s, /^(http|https|gopher):\/\//) == 0) {
		return "";
	}

	# shorten feedburner links. Cut the argument itself and not the link
	# field, this function is also used for the enclosure.
	if (match(s, /^(http|https):\/\/[^\/]+\/~r\/.*\/~3\/[^\/]+\//)) {
		s = substr(s, RSTART, RLENGTH);
	}

	# strip tracking parameters
	# urchin, facebook, piwik, webtrekk and generic.
	gsub(/\?(ad|campaign|pk|tm|wt|fbclid|utm)_([^&]+)/, "?", s);
	gsub(/&(ad|campaign|pk|tm|wt|fbclid|utm)_([^&]+)/, "", s);

	# tidy up separators left behind by the removals above.
	gsub(/\?&/, "?", s);
	gsub(/[?&]+$/, "", s);

	return s
}
# filtertitle(s): hook for title cleanups, all of them are disabled now so
# the title is returned unchanged.
function filtertitle(s) {
	#gsub("&#821[67];", "'"'"'", s);
	#gsub("<[/]?em>", "_", s);
	#gsub("<[/]?b>", "*", s);
	#gsub("<[/]?nobr>", "", s);
	#gsub("<[/]?wbr>", "", s);
	return s;
}
{
	$2 = filtertitle($2); # title

	$3 = filterlink($3); # link
	$8 = filterlink($8); # enclosure

	# try to remove tracking pixels: <img/> tags with 1px width or height.
	gsub("<img[^>]*(width|height)[[:space:]]*=[[:space:]]*[\"'"'"' ]?1[\"'"'"' ]?[^0-9>]+[^>]*>", "", $4);

	print $0;
}'
}