tfactolib.py - counterfacto - small software tool to analyze twitter and highlight counterfactual statements
HTML git clone git://parazyd.org/counterfacto.git
DIR Log
DIR Files
DIR Refs
DIR README
DIR LICENSE
---
tfactolib.py (5455B)
---
1 #!/usr/bin/env python2
2 # Copyright (c) 2017 Ivan J. <parazyd@dyne.org>
3
4 import nltk
5 import re
6 import twokenize
7 from nltk.tag.perceptron import PerceptronTagger
8
def tokenizelocal():
    """Print each line of ``tweetFile`` followed by its tokenization.

    For every line read from ``tweetFile`` the raw line is printed, then
    the string form of ``twokenize.tokenize(line)``; each is followed by
    an extra newline.
    """
    # NOTE(review): ``tweetFile`` is not defined in the visible part of
    # this module -- presumably a file object set as a global by the
    # caller; confirm before relying on this function.
    for line in tweetFile.read().splitlines():
        print(line + '\n')
        tokens = twokenize.tokenize(line)
        print(str(tokens) + '\n')
14
def format_tweet(message):
    """Return *message* as a single-line, ASCII-only native string.

    Newlines are replaced by spaces and every non-ASCII character (or
    byte) is dropped.

    message -- text, byte string, or any object convertible with str().

    Returns the native ``str`` type of the running interpreter (a byte
    string on Python 2, a text string on Python 3).
    """
    if isinstance(message, (bytes, type(u''))):
        text = message
    else:
        text = str(message)

    # Decode byte strings explicitly. Calling .encode() directly on a
    # Python 2 byte string performs an implicit strict ASCII decode first
    # and raises UnicodeDecodeError on any non-ASCII byte.
    if isinstance(text, bytes):
        text = text.decode('ascii', 'ignore')

    ascii_bytes = text.replace(u'\n', u' ').encode('ascii', 'ignore')

    # On Python 2 ``str is bytes``, so the encoded value is already the
    # native string type; on Python 3 convert back to text.
    if str is bytes:
        return ascii_bytes
    return ascii_bytes.decode('ascii')
20
def format_tagged(tagged_list):
    """Serialise (token, tag) pairs as one ``token/tag/token/tag/`` line.

    Each pair is first passed through ``postprocess_tag`` so that the
    project-specific tags replace the tagger's tag where applicable.

    tagged_list -- iterable of indexable pairs (token at 0, tag at 1).

    Returns the concatenated ``token/tag/`` pieces followed by a single
    newline.
    """
    pieces = []
    for pair in tagged_list:
        token, tag = postprocess_tag(pair[0], pair[1])
        pieces.append(token + '/' + tag + '/')
    # One newline terminates the whole record: the patterns in
    # get_cf_form use ``.*`` to span several tokens, which cannot cross a
    # newline.
    return ''.join(pieces) + '\n'
28
def postprocess_tag(token, tag):
    """Return ``(token, tag)`` with the tag overridden for special tokens.

    The tag becomes 'MD' when ``is_twitter_cf_modal(token)`` is true,
    otherwise 'CCJ' when ``tag_CCJ(token)`` is true; in every other case
    the incoming tag is returned unchanged. The token is never modified.
    """
    # The modal check takes precedence over the conjunction check.
    if is_twitter_cf_modal(token):
        return token, 'MD'
    if tag_CCJ(token):
        return token, 'CCJ'
    return token, tag
36
# A '?' token anywhere in the tagged message marks it as a question.
_QUESTION_RE = re.compile(r'/\?/.', re.IGNORECASE)

# (form number, compiled pattern) pairs, tried in this order; the first
# pattern found anywhere in the tagged message decides the form.
# All patterns are used with .search(), so no leading wildcard is needed.
_CF_FORM_PATTERNS = (
    # Form 1: "wish" verb form.
    (1, re.compile(r'(wish|wishing)/((VB.*/)|(JJ/))', re.IGNORECASE)),

    # Form 2: conjunction, normal order (CCJ ... past verb ... modal).
    (2, re.compile(r'/CCJ/.*((/VBD/)|(/VBN/)).*/MD/', re.IGNORECASE)),

    # Form 3: conjunction, converse order (modal ... CCJ ... past verb).
    (3, re.compile(r'/MD/.*/CCJ/.*((/VBN/)|(/VBD/))', re.IGNORECASE)),

    # Form 4: "should have".
    # The alternation between 'shoulda' and 'shulda' was missing its '|',
    # which made both spellings unmatchable on their own.
    # NOTE(review): the second alternative expects /VBN/ or /VBD/ right
    # after ``have/`` plus one run of word characters, i.e. it does not
    # skip over both the tag of "have" and the following token -- confirm
    # whether that is intended.
    (4, re.compile(
        r"/((should've)/MD/)"
        r"|(((should)|(shoulda)|(shulda)|(shuda)|(shudda)|(shudve))"
        r"/MD/((have)|(hve)|(ve))/)(\w)*((/VBN/)|(/VBD/))",
        re.IGNORECASE)),

    # Form 5: verb inversion ("had I ...", "were I ...").
    (5, re.compile(
        r"(had/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/))"
        r".*((/VBN/)|(/VBD/)).*/MD/)"
        r"|(were/(\w)*/(\w)*((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/))"
        r".*/MD/)"
        r"|(/MD/.*/VB.*/had/(\w)*/(\w)*"
        r"((/NN/)|(/NNP/)|(/NNPS/)|(/NNS/)|(/PRP/)).*((/VBN/)|(/VBD/)))",
        re.IGNORECASE)),

    # Form 6: modal, normal order (modal ... past verb ... modal ... verb).
    (6, re.compile(
        r'/MD/.*((/VBN/)|(/VBD/)).*/MD/.*((/VBN/)|(/VBD/)|(/VB/)|(VBZ))',
        re.IGNORECASE)),
)


def get_cf_form(tagged_message):
    """Classify a tagged message into a counterfactual form number.

    tagged_message -- string in ``token/tag/token/tag/...`` layout, as
                      produced by format_tagged.

    Returns 0 when the message contains a question-mark token or matches
    no pattern; otherwise the number (1-6) of the first matching form.
    Matching is case-insensitive.
    """
    # Questions are filtered out before any form is considered.
    if _QUESTION_RE.search(tagged_message) is not None:
        return 0

    for form, pattern in _CF_FORM_PATTERNS:
        if pattern.search(tagged_message) is not None:
            return form

    # No pattern matched.
    return 0
82
83
# Lower-case modal spellings (standard and Twitter slang) that are
# re-tagged as 'MD'.
_CF_MODALS = frozenset([
    'should', "should've", 'shouldve', 'shoulda', 'shulda', 'shuda',
    'shudda', 'shudve',
    'would', "would've", 'wouldve', 'woulda', 'wuda', 'wulda', 'wudda',
    'wudve', 'wlda',
    'could', "could've", 'couldve', 'coulda', 'cudda', 'culda', 'cudve',
    'must', 'mustve',
    'might', "might've", 'mightve',
    'ought', 'may',
    "i'd", 'id', "we'd", 'youd', "you'd", "he'd", "she'd",
])


def is_twitter_cf_modal(word):
    """Return True when *word* is one of the known modal spellings.

    The comparison is case-insensitive and ignores non-ASCII characters.

    word -- token as text or as a byte string.

    The previous implementation called ``unicode(word, errors='ignore')``,
    which raises TypeError for unicode input on Python 2 and does not
    exist on Python 3; this version accepts both string types on both
    interpreters.
    """
    # Normalise to ASCII-only text, dropping anything else, so text and
    # byte-string input are treated identically.
    if not isinstance(word, bytes):
        word = word.encode('ascii', 'ignore')
    normalized = word.decode('ascii', 'ignore').lower()
    return normalized in _CF_MODALS
126
def tag_CCJ(word):
    """Return True when *word* is a conditional-conjunction cue word.

    The comparison is case-insensitive (``word.lower()`` is compared with
    the cue list below).
    """
    # Conditional phrases listed by the original author for reference
    # (not every phrase has a cue word in the tuple below):
    #   as long as, even if, if, one condition that, provided (that),
    #   providing (that), so long as, unless, whether... or, supposing,
    #   suppose, imagine, but for
    cue_words = (
        'as',
        'if',
        'even',
        'provided',
        'providing',
        'suppose',
        'supposing',
        'unless',
        'whether',
        'envision',
        'envisioning',
        'conceptualize',
        'conceptualizing',
        'conjure',
        'conjuring',
        'visualize',
        'visualizing',
    )
    lowered = word.lower()
    return lowered in cue_words
153
def get_tagged_message(message, tagger):
    """Tokenize and POS-tag *message*, returning its tagged-line form.

    message -- raw tweet text; it is normalised with format_tweet first.
    tagger  -- tagger object exposing ``tag(tokens)`` (classify passes a
               PerceptronTagger).

    Returns the string built by format_tagged from the tagger output.
    """
    tokens = twokenize.tokenize(format_tweet(message))
    # Call the tagger's public API directly. The private helper
    # nltk.tag._pos_tag(tokens, None, tagger) used previously does nothing
    # beyond tagger.tag(tokens) when no tagset is given, and its signature
    # differs between NLTK releases.
    tagged_tokens = tagger.tag(tokens)
    return format_tagged(tagged_tokens)
160
def classify(tweetfile, taggedfile):
    """Tag every tweet in a file and record the counterfactual ones.

    tweetfile  -- path of the input file, one tweet per line.
    taggedfile -- path of the output file receiving the tagged form of
                  every tweet.

    Side effects: writes *taggedfile*; writes 'counterfactuals.txt' in the
    current directory with each tweet classified as a non-zero form
    (followed by '<hr>') and a final summary line; prints progress, the
    matching tweets and the summary to stdout. Returns None.
    """
    count = 0

    # ``with`` guarantees all three files are closed even if tagging or
    # classification raises part-way through.
    with open(tweetfile, "r") as tweets, \
            open(taggedfile, "w") as tagged_out, \
            open('counterfactuals.txt', 'w') as cf_out:
        tagger = PerceptronTagger()

        print("Reading file...")
        for tweet in tweets:
            tagged_tweet = get_tagged_message(tweet, tagger)
            tagged_out.write(tagged_tweet)

            # Form 0 means "not counterfactual"; any other form counts.
            if int(get_cf_form(tagged_tweet)):
                print(tweet)
                cf_out.write(tweet + '<hr>\n')
                count += 1

        print("Finished tagging...")

        # NOTE(review): the denominator is hard-coded to 100 -- presumably
        # the caller always supplies 100 tweets; confirm.
        summary = "counterfactuals: " + str(count) + "/100"
        print(summary)
        cf_out.write(summary + "<br>\n")
199 counterfactuals.close()