-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse.py
More file actions
163 lines (141 loc) · 5.58 KB
/
parse.py
File metadata and controls
163 lines (141 loc) · 5.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import csv
from pprint import pprint
from sklearn import svm
from nltk.tokenize import sent_tokenize as ntlk_sent_tokenize
from nltk.tokenize import word_tokenize as ntlk_word_tokenize
import nltk
from nltk.corpus import brown
import random
from filecache import filecache
# Load the system word list once at import time. The resulting set is the
# reference dictionary for the dictionary-ratio and misspelling features.
# NOTE(review): entries keep their original case, while tokenize() lower-cases
# every essay token, so dictionary entries containing capitals can never
# match -- confirm whether that is intended.
# The ``with`` block guarantees the handle is closed even if reading fails.
with open('/usr/share/dict/words', 'r') as words_file:
    ALL_ENGLISH_WORDS = set(line.strip() for line in words_file)
@filecache()
def load_brown_freq_ratios():
    """Return the relative frequency of each lower-cased Brown corpus word.

    Returns:
        dict mapping every lower-cased word in ``brown.words()`` to
        ``count / total``, where ``total`` is the number of tokens in the
        corpus.
    """
    # Materialise the lower-cased tokens once: the list has exactly one entry
    # per corpus token, so its length is the normalisation total and the
    # corpus does not have to be walked a second time just to count it.
    lowered = [w.lower() for w in brown.words()]
    num_words = len(lowered)
    brown_freqdist = nltk.FreqDist(lowered)
    # ``items()`` is available on Python 2 and Python 3 alike, whereas
    # ``iteritems()`` only exists on Python 2.
    return {
        word: float(number) / num_words
        for word, number in brown_freqdist.items()
    }


# Built once at import time and read by Feature.brown_freq_diff.
brown_freq_ratios = load_brown_freq_ratios()
def guess(in_essays, withold=1):
    """Train an SVR on all but the first ``withold`` essays and predict those.

    Args:
        in_essays: sequence of essay objects accepted by ``make_features``
            that also provide ``normalized_rating()``.
        withold: number of leading essays that are kept out of training and
            used as the prediction set.

    Returns:
        list of ``(prediction, essay)`` pairs, one per held-out essay.
    """
    features = [make_features(in_essay) for in_essay in in_essays]
    scores = [in_essay.normalized_rating() for in_essay in in_essays]

    # Fit with the default SVR parameters on everything after the held-out
    # prefix. (A second, explicitly parameterised SVR used to be constructed
    # here and immediately discarded; it never influenced the fitted model,
    # so it has been dropped.)
    clf = svm.SVR()
    clf.fit(features[withold:], scores[withold:])

    predictions = clf.predict(features[:withold])
    # Return a real list so callers can both print and iterate the pairs,
    # including on Python 3 where zip() is a one-shot iterator.
    return list(zip(predictions, in_essays[:withold]))
class Essay(object):
    """A single scored essay built from one row of the tab-separated data.

    Attributes:
        essay_id: integer id from column 0, or -1 when that column is empty.
        set_id: integer essay-set id from column 1.
        essay: essay text from column 2.
        domain1_score: integer score from column 6.
    """

    # Highest possible domain1 score for each essay set id.
    # There is no entry for set 2.
    max_ratings_per_type = {
        1: 12,
        3: 3,
        4: 3,
        5: 4,
        6: 4,
        7: 30,
        8: 60,
    }

    def __init__(self, in_row):
        """Populate the essay from ``in_row``, an indexable row of strings."""
        # An empty id column falls back to -1 instead of failing int('').
        self.essay_id = int(in_row[0] or -1)
        self.set_id = int(in_row[1])
        self.essay = in_row[2]
        self.domain1_score = int(in_row[6])

    def max_rating(self):
        """Return the maximum score for this essay's set.

        Raises:
            KeyError: if ``set_id`` has no entry in ``max_ratings_per_type``.
            AssertionError: if ``domain1_score`` exceeds that maximum.
        """
        max_rating = Essay.max_ratings_per_type[self.set_id]
        assert self.domain1_score <= max_rating, 'score %s was higher than max %s for type %s' % (self.domain1_score, max_rating, self.set_id)
        return max_rating

    def normalized_rating(self):
        """Return ``domain1_score`` divided by the set's maximum, as a float."""
        return float(self.domain1_score) / self.max_rating()

    def __repr__(self):
        """Return a short summary: normalised score, raw score and a text preview."""
        # "%.2f" gives two decimals; the previous "%f2" rendered six decimals
        # followed by a stray literal "2".
        return "<Essay Normed: %.2f Scored: %s/%s \"%s...\">" % (
            self.normalized_rating(),
            self.domain1_score,
            self.max_rating(),
            self.essay[:50],
        )
def make_features(in_essay):
    """Build the feature vector for one essay.

    The extractors are evaluated in the order listed, and that order is the
    column order of the returned list.
    """
    return [
        Feature.num_unique_words(in_essay),
        Feature.ratio_unique_words(in_essay),
        Feature.average_word_length(in_essay),
        Feature.period_ratio(in_essay),
        Feature.comma_ratio(in_essay),
        Feature.ratio_dict_words(in_essay),
        Feature.average_sentence_length(in_essay),
        Feature.num_unique_misspellings(in_essay),
        Feature.brown_freq_diff(in_essay),
    ]
class Feature:
    """Namespace of static feature extractors; each maps an essay to a number.

    Every extractor takes an essay object exposing an ``essay`` text
    attribute (and usable with ``tokenize`` / ``sent_tokenize``). Any ratio
    whose denominator would be zero -- for example on an empty essay --
    evaluates to ``0.0`` instead of raising ``ZeroDivisionError``.
    """

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is zero."""
        if not denominator:
            return 0.0
        return float(numerator) / denominator

    @staticmethod
    def num_unique_words(in_essay):
        """Return the number of distinct tokens in the essay."""
        return len(set(tokenize(in_essay)))

    @staticmethod
    def ratio_unique_words(in_essay):
        """Return total tokens divided by distinct tokens.

        Note the direction: this is tokens-per-distinct-token (>= 1 for a
        non-empty essay), not distinct / total.
        """
        return Feature._ratio(
            len(tokenize(in_essay)), Feature.num_unique_words(in_essay)
        )

    @staticmethod
    def average_word_length(in_essay):
        """Return the mean length of tokens that do not start with ``@``."""
        lengths = tuple(
            len(word) for word in tokenize(in_essay)
            if not word.startswith("@")
        )
        return Feature._ratio(sum(lengths), len(lengths))

    @staticmethod
    def period_ratio(in_essay):
        """Return the number of ``.`` characters per character of essay text."""
        text = in_essay.essay
        return Feature._ratio(text.count('.'), len(text))

    @staticmethod
    def comma_ratio(in_essay):
        """Return the number of ``,`` characters per character of essay text."""
        text = in_essay.essay
        return Feature._ratio(text.count(','), len(text))

    @staticmethod
    def ratio_dict_words(in_essay):
        """Return the share of non-``@`` tokens found in ``ALL_ENGLISH_WORDS``."""
        words = [w for w in tokenize(in_essay) if not w.startswith("@")]
        words_in_dict = [w for w in words if w in ALL_ENGLISH_WORDS]
        return Feature._ratio(len(words_in_dict), len(words))

    @staticmethod
    def num_unique_misspellings(in_essay):
        """Return the count of distinct non-``@`` tokens absent from ``ALL_ENGLISH_WORDS``."""
        misspelled_words = set(
            w for w in tokenize(in_essay)
            if not w.startswith("@") and w not in ALL_ENGLISH_WORDS
        )
        return len(misspelled_words)

    @staticmethod
    def average_sentence_length(in_essay):
        """Return the mean ``len()`` of the items returned by ``sent_tokenize``."""
        lengths = [len(sentence) for sentence in sent_tokenize(in_essay)]
        return Feature._ratio(sum(lengths), len(lengths))

    @staticmethod
    def brown_freq_diff(in_essay):
        """Return the mean absolute gap between essay and Brown word frequencies.

        For every token occurrence, the token's relative frequency in the
        essay is compared with its entry in ``brown_freq_ratios`` (0 when
        absent). Tokens starting with ``@`` are left out of the essay counts,
        so their essay frequency is 0, but they are still part of the token
        total and of the average.
        """
        # Fetch the tokens and their count once instead of on every iteration.
        tokens = tokenize(in_essay)
        total = len(tokens)
        in_dist = nltk.FreqDist([w for w in tokens if not w.startswith("@")])
        diffs = [
            abs(float(in_dist[word]) / total - brown_freq_ratios.get(word, 0))
            for word in tokens
        ]
        return Feature._ratio(sum(diffs), len(diffs))
def tokenize(in_essay):
    """Return the essay's lower-cased, whitespace-separated tokens as a tuple.

    Results are memoised on ``tokenize.cache``, keyed by the essay object, so
    ``in_essay`` must be hashable and repeated calls return the same tuple.

    Args:
        in_essay: object exposing the essay text as ``in_essay.essay``.
    """
    if not hasattr(tokenize, 'cache'):
        tokenize.cache = {}
    if in_essay in tokenize.cache:
        return tokenize.cache[in_essay]
    # The tokens are a plain whitespace split. An NLTK word-tokenize pass used
    # to run first, but its result was overwritten by this line before being
    # used, so that wasted work has been removed.
    out_tokens = tuple(i.lower() for i in in_essay.essay.split())
    tokenize.cache[in_essay] = out_tokens
    return out_tokens
def sent_tokenize(in_essay):
    """Return the essay text split into sentences by NLTK.

    Results are memoised on ``sent_tokenize.cache``, keyed by the essay
    object, so ``in_essay`` must be hashable.

    Args:
        in_essay: object exposing the essay text as ``in_essay.essay``.
    """
    # Keep a cache of our own. Sharing the word tokenizer's cache meant that
    # whichever of the two functions saw an essay first decided what both of
    # them returned for it afterwards.
    if not hasattr(sent_tokenize, 'cache'):
        sent_tokenize.cache = {}
    if in_essay in sent_tokenize.cache:
        return sent_tokenize.cache[in_essay]
    out_tokens = ntlk_sent_tokenize(in_essay.essay)
    sent_tokenize.cache[in_essay] = out_tokens
    return out_tokens
@filecache()
def parse(in_filename):
    """Read a tab-separated essay file and return its essays as a list.

    The first row is treated as a header and skipped. Every remaining row is
    turned into an ``Essay``; essays whose ``set_id`` is 2 are left out.

    Args:
        in_filename: path of the tab-separated file to read.

    Returns:
        list of ``Essay`` objects in file order.
    """
    essays = []
    # ``with`` closes the file even when a malformed row makes Essay() raise.
    with open(in_filename, 'r') as in_file:
        reader = csv.reader(in_file, delimiter='\t')
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            this_essay = Essay(row)
            if this_essay.set_id != 2:
                essays.append(this_essay)
    return essays
if __name__ == '__main__':
    # Script entry point: train on the shuffled training set with the first
    # 50 essays held out, then report how far the predictions are off.
    essays = parse("training/training_set_rel3.tsv")
    random.shuffle(essays)
    # list() keeps the pairs reusable for both the pretty-print and the loop
    # below, even if guess() hands back a one-shot iterator.
    guesses = list(guess(essays, withold=50))
    pprint(guesses)
    # NOTE(review): guess() is trained on normalized_rating() values, so
    # round() turns each prediction into a whole number before it is compared
    # with the fractional rating -- confirm this is the intended metric.
    # The loop variable is named ``predicted`` so it does not shadow guess().
    diffs = tuple(
        abs(round(predicted) - essay.normalized_rating())
        for predicted, essay in guesses
    )
    # A single pre-formatted string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("Average Difference: %s" % (sum(diffs) / len(diffs)))