#! /usr/bin/env python
# -*- python -*-
# reuters - v3
"""Print sparse bag-of-words feature lines for a few Reuters documents.

For every corpus file whose id matches ``training/1[1234]$`` the script
prints one line of the form::

    <label> <index>:<count> <index>:<count> ... # <fileid>

``<label>`` is ``1`` when the document carries the ``acq`` category and
``-1`` otherwise.  Each ``<index>:<count>`` pair gives the vocabulary index
of a whitespace-delimited token and the number of times that token occurs
in the document; pairs are written in ascending index order.  The
vocabulary is built from the selected documents only, and indices start
at 1.
"""

import re

import nltk  # explicit, so nltk.FreqDist does not depend on the star import
from nltk import *
from nltk.corpus import reuters

# Compiled once here rather than on every loop iteration.
FILE_ID_PATTERN = re.compile('training/1[1234]$')

# Newer NLTK corpus readers expose fileids(); older releases named the same
# call files().  Use whichever one this installation provides.
list_file_ids = getattr(reuters, 'fileids', None) or reuters.files
filenames = list_file_ids()

# Select the documents to export and remember which ones are positive.
# NOTE(review): the original source had lost its line breaks, so the nesting
# of its `else:` branch had to be reconstructed.  It is read here as
# "append the file whether or not it is tagged 'acq'", which makes both
# branches identical and lets the append be written once -- confirm.
all_filenames = []
my_pos_filename_dict = {}
for filename in filenames:
    if FILE_ID_PATTERN.match(filename):
        if 'acq' in reuters.categories(filename):
            my_pos_filename_dict[filename] = 1
        all_filenames.append(filename)

# Read every selected document once, split it on whitespace, and always
# close the corpus stream (the original left the streams open and read each
# document twice).
tokens_by_file = {}
for filename in all_filenames:
    my_file = reuters.open(filename)
    try:
        tokens_by_file[filename] = my_file.read().split()
    finally:
        my_file.close()

# build vocabulary
all_vocab = set()
for filename in all_filenames:
    # In-place update avoids copying the whole set for every document.
    all_vocab.update(tokens_by_file[filename])

# assign index to each vocab item
# NOTE(review): indices follow set iteration order, exactly as before, so
# they are not guaranteed to be the same from one run to the next.
my_dict = {}
for my_index, w in enumerate(all_vocab, 1):
    my_dict[w] = my_index

for filename in all_filenames:
    fd = nltk.FreqDist(tokens_by_file[filename])

    # Pair each token's vocabulary index with its count directly; sorting
    # the pairs orders them by index (indices are unique per document).
    entries = sorted((my_dict[sample], count) for sample, count in fd.items())

    # compile output
    label = "1" if filename in my_pos_filename_dict else "-1"
    fields = [label]
    fields.extend('%d:%d' % entry for entry in entries)
    line = ' '.join(fields) + " # %s" % (filename)

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(line)