#! /usr/bin/env python
# -*- python -*-
# reuters - v4
"""Print sparse bag-of-stems feature vectors for a sample of Reuters documents.

Documents whose corpus id matches ``training/NN`` (exactly two digits) are
selected.  Each one is lowercased, split on whitespace and Porter-stemmed,
and one line per document is printed in the layout::

    <label> <feature_id>:<count> ... # <file id>

where ``<label>`` is ``1`` if the document carries the ``acq`` category and
``-1`` otherwise, and feature ids are 1-based positions of the stems in the
sorted vocabulary of the selected documents.
"""

import re
from collections import Counter
from contextlib import closing

# Selects the sampled documents: "training/" followed by exactly two digits.
TRAINING_SAMPLE_RE = re.compile(r'training/\d\d$')

# Documents tagged with this category are the positive examples.
POSITIVE_CATEGORY = 'acq'


def stem_counts(text, stem):
    """Return a ``Counter`` mapping each stem in *text* to its frequency.

    *text* is lowercased and split on whitespace; *stem* is a callable
    applied to every resulting token.
    """
    return Counter(stem(token) for token in text.lower().split())


def build_vocab_index(counts_per_doc):
    """Return a dict mapping every stem seen in *counts_per_doc* to a 1-based id.

    *counts_per_doc* is an iterable of mappings keyed by stem.  Ids follow the
    sorted order of the stems so the numbering is the same on every run,
    instead of depending on set iteration order.
    """
    vocab = set()
    for counts in counts_per_doc:
        vocab.update(counts)
    return {stem: idx for idx, stem in enumerate(sorted(vocab), 1)}


def format_example(label, counts, vocab_index, filename):
    """Return one output line for a document.

    *label* is the class label string, *counts* maps stem -> frequency,
    *vocab_index* maps stem -> feature id, and *filename* is appended after
    ``#``.  Features are emitted in ascending feature-id order.

    Raises ``KeyError`` if a stem in *counts* is missing from *vocab_index*.
    """
    features = sorted((vocab_index[stem], freq) for stem, freq in counts.items())
    fields = [label]
    fields.extend('%d:%d' % pair for pair in features)
    return ' '.join(fields) + " # %s" % (filename)


def main():
    """Read the sampled Reuters documents and print one feature line each."""
    # Imported here (and by explicit name rather than ``import *``) so the
    # helpers above can be imported without NLTK or its corpus data.
    from nltk import PorterStemmer
    from nltk.corpus import reuters

    stem = PorterStemmer().stem

    # ``fileids()`` is the corpus-reader method in current NLTK releases.
    sample_ids = [fid for fid in reuters.fileids()
                  if TRAINING_SAMPLE_RE.match(fid)]
    positives = set(fid for fid in sample_ids
                    if POSITIVE_CATEGORY in reuters.categories(fid))

    # Read and stem every document once; the counts serve both the
    # vocabulary pass and the output pass.
    counts_by_doc = []
    for fid in sample_ids:
        with closing(reuters.open(fid)) as stream:
            counts_by_doc.append(stem_counts(stream.read(), stem))

    vocab_index = build_vocab_index(counts_by_doc)

    for fid, counts in zip(sample_ids, counts_by_doc):
        label = "1" if fid in positives else "-1"
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(format_example(label, counts, vocab_index, fid))


if __name__ == "__main__":
    main()