hw08_nltk - Analyze PDF

Title hw08_nltk - Analyze
Course Symbolische Programmiersprache
Institution Ludwig-Maximilians-Universität München
Pages 2
File Size 36.2 KB
File Type PDF
Total Downloads 107
Total Views 155

Summary

Lösung zur Programmieraufgabe hw08 (1/2)...


Description

from nltk import FreqDist
from nltk import word_tokenize
import collections


class Analyzer(object):
    """Compute basic corpus statistics for a text file using NLTK.

    On construction the file is read, tokenized with ``nltk.word_tokenize``,
    and a frequency distribution (``FreqDist``) over the tokens is built.
    All other methods are derived from ``self.text`` (the token list) and
    ``self.token_counts`` (token -> frequency).
    """

    def __init__(self, path):
        """Read the file at *path*, tokenize it, and build the
        frequency distribution of its tokens.

        :param path: path to a plain-text file.
        """
        # Explicit encoding so results do not depend on the platform
        # default locale. NOTE(review): assumes the corpus is UTF-8 —
        # confirm against the course data.
        with open(path, 'r', encoding='utf-8') as myfile:
            text = myfile.read().strip()
        self.text = word_tokenize(text)          # list of tokens, in order
        self.token_counts = FreqDist(self.text)  # token -> count

    def numberOfTokens(self):
        '''returns number of tokens in the text'''
        return len(self.text)

    def vocabularySize(self):
        '''returns the size of the vocabulary (number of distinct tokens)'''
        # Fixed docstring: the original claimed a list was returned,
        # but the method returns a count.
        return len(self.token_counts)

    def lexicalDiversity(self):
        '''returns the lexical diversity of the text (tokens per type,
        i.e. the average number of times each distinct word is used)'''
        return len(self.text) / len(self.token_counts)

    def getKeywords(self):
        '''return words as possible key words, that are longer than seven
        characters and occur more than seven times (sorted alphabetically)'''
        return sorted(
            word
            for word, count in self.token_counts.items()
            if count > 7 and len(word) > 7
        )

    def numberOfHapaxes(self):
        '''returns the number of hapaxes (words occurring exactly once)
        in the text'''
        return sum(1 for count in self.token_counts.values() if count == 1)

    def avWordLength(self):
        '''returns the average word length of the text, averaged over the
        vocabulary (types), not over token occurrences'''
        total = sum(len(word) for word in self.token_counts)
        return total / len(self.token_counts)

    def topSuffixes(self):
        '''returns the 10 most frequent 2-letter suffixes in words
        (restricted to words of length 5 or more)'''
        # dict.fromkeys de-duplicates while preserving first-seen order,
        # so Counter ties break the same way on every run.
        unique = list(dict.fromkeys(w for w in self.text if len(w) > 4))
        suffixes = [w[-2:] for w in unique]
        return [s for s, _ in collections.Counter(suffixes).most_common(10)]

    def topPrefixes(self):
        '''returns the 10 most frequent 2-letter prefixes in words
        (restricted to words of length 5 or more)'''
        unique = list(dict.fromkeys(w for w in self.text if len(w) > 4))
        prefixes = [w[:2] for w in unique]
        return [p for p, _ in collections.Counter(prefixes).most_common(10)]

    def tokensTypical(self):
        '''returns the first 5 tokens of the (alphabetically sorted)
        vocabulary that start with one of the top prefixes AND end with
        one of the top suffixes (both 2 characters long, as in
        topPrefixes() and topSuffixes())'''
        # Hoist the affix lists out of the loop: the original recomputed
        # topPrefixes()/topSuffixes() for every vocabulary word (O(n^2)).
        # A word has exactly one 2-char prefix/suffix, so the tuple forms
        # of startswith/endswith match at most one entry each — results
        # are identical to the original nested loops.
        prefixes = tuple(self.topPrefixes())
        suffixes = tuple(self.topSuffixes())
        matches = [
            word
            for word in sorted(set(self.text))
            if word.startswith(prefixes) and word.endswith(suffixes)
        ]
        return matches[:5]


Similar Free PDFs