twokenize.py - counterfacto - small software tool to analyze twitter and highlight counterfactual statements
git clone git://parazyd.org/counterfacto.git
---
twokenize.py (13000B)
---
# -*- coding: utf-8 -*-
"""
Twokenize -- a tokenizer designed for Twitter text in English and some other European languages.
This tokenizer code has gone through a long history:

(1) Brendan O'Connor wrote original version in Python, http://github.com/brendano/tweetmotif
       TweetMotif: Exploratory Search and Topic Summarization for Twitter.
       Brendan O'Connor, Michel Krieger, and David Ahn.
       ICWSM-2010 (demo track), http://brenocon.com/oconnor_krieger_ahn.icwsm2010.tweetmotif.pdf
(2a) Kevin Gimpel and Daniel Mills modified it for POS tagging for the CMU ARK Twitter POS Tagger
(2b) Jason Baldridge and David Snyder ported it to Scala
(3) Brendan bugfixed the Scala port and merged with POS-specific changes
    for the CMU ARK Twitter POS Tagger
(4) Tobi Owoputi ported it back to Java and added many improvements (2012-06)

Current home is http://github.com/brendano/ark-tweet-nlp and http://www.ark.cs.cmu.edu/TweetNLP

There have been at least 2 other Java ports, but they are not in the lineage for the code here.

Ported to Python by Myle Ott <myleott@gmail.com>.
"""

from __future__ import print_function

import operator
import re
import HTMLParser

def regex_or(*items):
    return '(?:' + '|'.join(items) + ')'
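# Added note (not in the original file): regex_or simply joins its arguments
# into one non-capturing alternation, e.g. regex_or("a", "b") -> "(?:a|b)".
# Every composite pattern below is assembled with it.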

Contractions = re.compile(u"(?i)(\w+)(n['’′]t|['’′]ve|['’′]ll|['’′]d|['’′]re|['’′]s|['’′]m)$", re.UNICODE)
Whitespace = re.compile(u"[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+", re.UNICODE)

punctChars = r"['\"“”‘’.?!…,:;]"
#punctSeq = punctChars+"+" #'anthem'. => ' anthem '.
punctSeq = r"['\"“”‘’]+|[.?!,…]+|[:;]+" #'anthem'. => ' anthem ' .
entity = r"&(?:amp|lt|gt|quot);"
# URLs


# BTO 2012-06: everyone thinks the daringfireball regex should be better, but they're wrong.
# If you actually empirically test it the results are bad.
# Please see https://github.com/brendano/ark-tweet-nlp/pull/9

urlStart1 = r"(?:https?://|\bwww\.)"
commonTLDs = r"(?:com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|pro|tel|travel|xxx)"
ccTLDs = r"(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|" + \
    r"bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|" + \
    r"er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|" + \
    r"hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|" + \
    r"lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|" + \
    r"nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|" + \
    r"sl|sm|sn|so|sr|ss|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|" + \
    r"va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw)" #TODO: remove obscure country domains?
urlStart2 = r"\b(?:[A-Za-z\d-])+(?:\.[A-Za-z0-9]+){0,3}\." + regex_or(commonTLDs, ccTLDs) + r"(?:\."+ccTLDs+r")?(?=\W|$)"
urlBody = r"(?:[^\.\s<>][^\s<>]*?)?"
urlExtraCrapBeforeEnd = regex_or(punctChars, entity) + "+?"
urlEnd = r"(?:\.\.+|[<>]|\s|$)"
url = regex_or(urlStart1, urlStart2) + urlBody + "(?=(?:"+urlExtraCrapBeforeEnd+")?"+urlEnd+")"
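# Added note (not in the original file): `url` is meant to catch both
# scheme-prefixed links and bare domains, e.g. "http://example.com/page"
# (via urlStart1) and "example.com" (via urlStart2); the trailing lookahead
# keeps closing junk such as "..." or ">" out of the token. The addresses
# here are made up for illustration.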


# Numeric
timeLike = r"\d+(?::\d+){1,2}"
#numNum = r"\d+\.\d+"
numberWithCommas = r"(?:(?<!\d)\d{1,3},)+?\d{3}" + r"(?=(?:[^,\d]|$))"
numComb = u"[\u0024\u058f\u060b\u09f2\u09f3\u09fb\u0af1\u0bf9\u0e3f\u17db\ua838\ufdfc\ufe69\uff04\uffe0\uffe1\uffe5\uffe6\u00a2-\u00a5\u20a0-\u20b9]?\\d+(?:\\.\\d+)+%?".encode('utf-8')

# Abbreviations
boundaryNotDot = regex_or("$", r"\s", r"[“\"?!,:;]", entity)
aa1 = r"(?:[A-Za-z]\.){2,}(?=" + boundaryNotDot + ")"
aa2 = r"[^A-Za-z](?:[A-Za-z]\.){1,}[A-Za-z](?=" + boundaryNotDot + ")"
standardAbbreviations = r"\b(?:[Mm]r|[Mm]rs|[Mm]s|[Dd]r|[Ss]r|[Jj]r|[Rr]ep|[Ss]en|[Ss]t)\."
arbitraryAbbrev = regex_or(aa1, aa2, standardAbbreviations)
separators = "(?:--+|―|—|~|–|=)"
decorations = u"(?:[♫♪]+|[★☆]+|[♥❤♡]+|[\u2639-\u263b]+|[\ue001-\uebbb]+)".encode('utf-8')
thingsThatSplitWords = r"[^\s\.,?\"]"
embeddedApostrophe = thingsThatSplitWords+r"+['’′]" + thingsThatSplitWords + "*"

# Emoticons
# myleott: in Python the (?iu) flags affect the whole expression
#normalEyes = "(?iu)[:=]" # 8 and x are eyes but cause problems
normalEyes = "[:=]" # 8 and x are eyes but cause problems
wink = "[;]"
noseArea = "(?:|-|[^a-zA-Z0-9 ])" # doesn't get :'-(
happyMouths = r"[D\)\]\}]+"
sadMouths = r"[\(\[\{]+"
tongue = "[pPd3]+"
otherMouths = r"(?:[oO]+|[/\\]+|[vV]+|[Ss]+|[|]+)" # remove forward slash if http://'s aren't cleaned

# mouth repetition examples:
# @aliciakeys Put it in a love song :-))
# @hellocalyclops =))=))=)) Oh well

# myleott: try to be as case insensitive as possible, but still not perfect, e.g., o.O fails
#bfLeft = u"(♥|0|o|°|v|\\$|t|x|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfLeft = u"(♥|0|[oO]|°|[vV]|\\$|[tT]|[xX]|;|\u0ca0|@|ʘ|•|・|◕|\\^|¬|\\*)".encode('utf-8')
bfCenter = r"(?:[\.]|[_-]+)"
bfRight = r"\2"
s3 = r"(?:--['\"])"
s4 = r"(?:<|&lt;|>|&gt;)[\._-]+(?:<|&lt;|>|&gt;)"
s5 = "(?:[.][_]+[.])"
# myleott: in Python the (?i) flag affects the whole expression
#basicface = "(?:(?i)" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5
basicface = "(?:" +bfLeft+bfCenter+bfRight+ ")|" +s3+ "|" +s4+ "|" + s5

eeLeft = r"[\\\ƪԄ\((<>;ヽ\-=~\*]+"
eeRight= u"[\\-=\\);'\u0022<>ʃ)//ノノ丿╯σっµ~\\*]+".encode('utf-8')
eeSymbol = r"[^A-Za-z0-9\s\(\)\*:=-]"
eastEmote = eeLeft + "(?:"+basicface+"|" +eeSymbol+")+" + eeRight

oOEmote = r"(?:[oO]" + bfCenter + r"[oO])"


emoticon = regex_or(
    # Standard version :) :( :] :D :P
    "(?:>|&gt;)?" + regex_or(normalEyes, wink) + regex_or(noseArea,"[Oo]") + regex_or(tongue+r"(?=\W|$|RT|rt|Rt)", otherMouths+r"(?=\W|$|RT|rt|Rt)", sadMouths, happyMouths),

    # reversed version (: D: use positive lookbehind to remove "(word):"
    # because eyes on the right side is more ambiguous with the standard usage of : ;
    regex_or("(?<=(?: ))", "(?<=(?:^))") + regex_or(sadMouths,happyMouths,otherMouths) + noseArea + regex_or(normalEyes, wink) + "(?:<|&lt;)?",

    #inspired by http://en.wikipedia.org/wiki/User:Scapler/emoticons#East_Asian_style
    eastEmote.replace("2", "1", 1), basicface,
    # iOS 'emoji' characters (some smileys, some symbols) [\ue001-\uebbb]
    # TODO should try a big precompiled lexicon from Wikipedia, Dan Ramage told me (BTO) he does this

    # myleott: o.O and O.o are two of the biggest sources of differences
    # between this and the Java version. One little hack won't hurt...
    oOEmote
)
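# Added note (not in the original file): per the comments above, this
# alternation targets standard faces like ":)" ":-D" ":P", reversed faces
# like "(:", East Asian faces built from bfLeft/bfCenter/bfRight, and the
# o.O family matched by oOEmote.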

Hearts = "(?:<+/?3+)+" #the other hearts are in decorations

Arrows = regex_or(r"(?:<*[-―—=]*>+|<+[-―—=]*>*)", u"[\u2190-\u21ff]+".encode('utf-8'))

# BTO 2011-06: restored Hashtag, AtMention protection (dropped in original scala port) because it fixes
# "hello (#hashtag)" ==> "hello (#hashtag )" WRONG
# "hello (#hashtag)" ==> "hello ( #hashtag )" RIGHT
# "hello (@person)" ==> "hello (@person )" WRONG
# "hello (@person)" ==> "hello ( @person )" RIGHT
# ... Some sort of weird interaction with edgepunct I guess, because edgepunct
# has poor content-symbol detection.

# This also gets #1 #40 which probably aren't hashtags .. but good as tokens.
# If you want good hashtag identification, use a different regex.
Hashtag = "#[a-zA-Z0-9_]+" #optional: lookbehind for \b
#optional: lookbehind for \b, max length 15
AtMention = "[@＠][a-zA-Z0-9_]+"

# I was worried this would conflict with at-mentions
# but seems ok in sample of 5800: 7 changes all email fixes
# http://www.regular-expressions.info/email.html
Bound = r"(?:\W|^|$)"
Email = regex_or("(?<=(?:\W))", "(?<=(?:^))") + r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}(?=" +Bound+")"

# We will be tokenizing using these regexps as delimiters
# Additionally, these things are "protected", meaning they shouldn't be further split themselves.
Protected = re.compile(
    unicode(regex_or(
        Hearts,
        url,
        Email,
        timeLike,
        #numNum,
        numberWithCommas,
        numComb,
        emoticon,
        Arrows,
        entity,
        punctSeq,
        arbitraryAbbrev,
        separators,
        decorations,
        embeddedApostrophe,
        Hashtag,
        AtMention
    ).decode('utf-8')), re.UNICODE)

# Edge punctuation
# Want: 'foo' => ' foo '
# While also: don't => don't
# the first is considered "edge punctuation".
# the second is word-internal punctuation -- don't want to mess with it.
# BTO (2011-06): the edgepunct system seems to be the #1 source of problems these days.
# I remember it causing lots of trouble in the past as well. Would be good to revisit or eliminate.

# Note the 'smart quotes' (http://en.wikipedia.org/wiki/Smart_quotes)
#edgePunctChars = r"'\"“”‘’«»{}\(\)\[\]\*&" #add \\p{So}? (symbols)
edgePunctChars = u"'\"“”‘’«»{}\\(\\)\\[\\]\\*&" #add \\p{So}? (symbols)
edgePunct = "[" + edgePunctChars + "]"
notEdgePunct = "[a-zA-Z0-9]" # content characters
offEdge = r"(^|$|:|;|\s|\.|,)" # colon here gets "(hello):" ==> "( hello ):"
EdgePunctLeft = re.compile(offEdge + "("+edgePunct+"+)("+notEdgePunct+")", re.UNICODE)
EdgePunctRight = re.compile("("+notEdgePunct+")("+edgePunct+"+)" + offEdge, re.UNICODE)

def splitEdgePunct(input):
    input = EdgePunctLeft.sub(r"\1\2 \3", input)
    input = EdgePunctRight.sub(r"\1 \2\3", input)
    return input
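# Added example (not in the original file), following the edge-punctuation
# comments above:
#   splitEdgePunct("(hello):")  ->  "( hello ):"
# Edge parentheses get padded with spaces, while word-internal punctuation
# (as in "don't") is left untouched.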

# The main work of tokenizing a tweet.
def simpleTokenize(text):

    # Do the no-brainers first
    splitPunctText = splitEdgePunct(text)

    textLength = len(splitPunctText)

    # BTO: the logic here got quite convoluted via the Scala porting detour
    # It would be good to switch back to a nice simple procedural style like in the Python version
    # ... Scala is such a pain. Never again.

    # Find the matches for subsequences that should be protected,
    # e.g. URLs, 1.0, U.N.K.L.E., 12:53
    bads = []
    badSpans = []
    for match in Protected.finditer(splitPunctText):
        # The spans of the "bads" should not be split.
        if (match.start() != match.end()): #unnecessary?
            bads.append( [splitPunctText[match.start():match.end()]] )
            badSpans.append( (match.start(), match.end()) )

    # Create a list of indices to create the "goods", which can be
    # split. We are taking "bad" spans like
    #     List((2,5), (8,10))
    # to create
    #     List(0, 2, 5, 8, 10, 12)
    # where, e.g., "12" here would be the textLength
    # has an even length and no indices are the same
    indices = [0]
    for (first, second) in badSpans:
        indices.append(first)
        indices.append(second)
    indices.append(textLength)

    # Group the indices and map them to their respective portion of the string
    splitGoods = []
    for i in range(0, len(indices), 2):
        goodstr = splitPunctText[indices[i]:indices[i+1]]
        splitstr = goodstr.strip().split(" ")
        splitGoods.append(splitstr)

    # Reinterpolate the 'good' and 'bad' Lists, ensuring that
    # additional tokens from the last good item get included
    zippedStr = []
    for i in range(len(bads)):
        zippedStr = addAllnonempty(zippedStr, splitGoods[i])
        zippedStr = addAllnonempty(zippedStr, bads[i])
    zippedStr = addAllnonempty(zippedStr, splitGoods[len(bads)])

    # BTO: our POS tagger wants "ur" and "you're" to both be one token.
    # Uncomment to get "you 're"
    #splitStr = []
    #for tok in zippedStr:
    #    splitStr.extend(splitToken(tok))
    #zippedStr = splitStr

    return zippedStr
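# Added worked example (not in the original file; a best-guess illustration of
# the span logic above). For an input like "lol :) http://foo.com", Protected
# should match ":)" and the URL as "bad" (unsplittable) spans, the text around
# them becomes the "good" spans, and the interleaving yields roughly
# ["lol", ":)", "http://foo.com"].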

def addAllnonempty(master, smaller):
    for s in smaller:
        strim = s.strip()
        if (len(strim) > 0):
            master.append(strim)
    return master

# "foo   bar " => "foo bar"
def squeezeWhitespace(input):
    return Whitespace.sub(" ", input).strip()

# Final pass tokenization based on special patterns
def splitToken(token):
    m = Contractions.search(token)
    if m:
        return [m.group(1), m.group(2)]
    return [token]
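# Added example (not in the original file): with the Contractions pattern from
# the top of the module, splitToken("don't") -> ["do", "n't"]. simpleTokenize
# only applies this if the commented-out block above is re-enabled; by default
# contractions are kept as single tokens.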

# Assume 'text' has no HTML escaping.
def tokenize(text):
    return simpleTokenize(squeezeWhitespace(text))


# Twitter text comes HTML-escaped, so unescape it.
# We also first unescape &amp;'s, in case the text has been buggily double-escaped.
def normalizeTextForTagger(text):
    text = text.replace("&amp;", "&")
    text = HTMLParser.HTMLParser().unescape(text)
    return text
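# Added example (not in the original file): for Twitter's HTML-escaped
# payloads, normalizeTextForTagger("&amp;lt;3") -> "<3"; the &amp; pre-pass
# guards against buggy double-escaping before the full unescape runs.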

# This is intended for raw tweet text -- we do some HTML entity unescaping before running the tagger.
#
# This function normalizes the input text BEFORE calling the tokenizer.
# So the tokens you get back may not exactly correspond to
# substrings of the original text.
def tokenizeRawTweetText(text):
    tokens = tokenize(normalizeTextForTagger(text))
    return tokens
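
# ----------------------------------------------------------------------
# Minimal usage sketch, added for this listing and not part of the original
# module. It assumes Python 2, which this file targets (HTMLParser, unicode()),
# and a made-up sample tweet.
if __name__ == '__main__':
    sample = "If only I hadn't tweeted that... &lt;3 #regret http://example.com :)"
    for tok in tokenizeRawTweetText(sample):
        print(tok)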