好的,为了便于测试,我们假设你从下面这段代码开始:
class Rated(object):
    """A text sample paired with the rating it was given.

    Attributes:
        rating: The value passed in as ``rating``, stored unchanged.
        freq_dist: The value passed in as ``freq_dist``, stored unchanged
            (in the sample data this is an ``nltk.FreqDist`` of the
            text's words).
    """

    def __init__(self, rating, freq_dist):
        self.rating = rating
        self.freq_dist = freq_dist
# Sample data for testing: three rated texts (two rated above zero, one below).
# NOTE(review): assumes `nltk` has already been imported -- confirm.
a = Rated(5, nltk.FreqDist("the boy sees the dog".split()))
b = Rated(8, nltk.FreqDist("the cat sees the mouse".split()))
c = Rated(-3, nltk.FreqDist("some boy likes nothing".split()))
trainingTexts = [a, b, c]
那么你的代码大致如下:
from collections import defaultdict
from operator import itemgetter
# dictionaries for keeping track of the counts
pos_dict = defaultdict(int)
neg_dict = defaultdict(int)
for r in trainingTexts:
    rating = r.rating
    freq = r.freq_dist
    # choose the appropriate counts dict
    if rating > 0:
        partition = pos_dict
    elif rating < 0:
        partition = neg_dict
    else:
        # a rating of exactly 0 is neither positive nor negative: skip it
        continue
    # add the information to the correct counts dict
    # (.items() works on Python 2 and 3; .iteritems() exists only on Python 2)
    for word, count in freq.items():
        partition[word] += count
# Turn the counts dictionaries into lists of descending-frequency words
def only_list(counts, filtered):
    """Return the (word, count) pairs of ``counts`` whose word is not in ``filtered``.

    Args:
        counts: Mapping of word -> count to select from.
        filtered: Container of words to exclude; only membership
            (``word in filtered``) is tested.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Pairs with equal counts keep the order in which ``counts.items()``
        yielded them, because ``sorted`` is stable even with ``reverse=True``.
    """
    # A comprehension replaces the original ``lambda (w, c): ...``; tuple
    # parameters were removed in Python 3 and are a SyntaxError there.
    kept = [(word, count) for word, count in counts.items()
            if word not in filtered]
    return sorted(kept, key=itemgetter(1), reverse=True)
# Words counted in pos_dict that never appear in neg_dict, most frequent first.
only_positive_words = only_list(counts=pos_dict, filtered=neg_dict)
# Words counted in neg_dict that never appear in pos_dict, most frequent first.
only_negative_words = only_list(counts=neg_dict, filtered=pos_dict)
结果就是:
>>> only_positive_words
[('the', 4), ('sees', 2), ('dog', 1), ('cat', 1), ('mouse', 1)]
>>> only_negative_words
[('nothing', 1), ('some', 1), ('likes', 1)]