This notebook cross-validates the CLTK's part-of-speech taggers. The final results are found at the very bottom.
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import AffixTagger
from nltk.tag import BigramTagger
from nltk.tag import tnt
from nltk.tag import TrigramTagger
from nltk.tag import UnigramTagger
from nltk.tokenize import wordpunct_tokenize
import math
import os
import pandas as pd
import random
from statistics import mean
from statistics import stdev
# Path to the full Perseus Greek treebank POS training set.
full_training_set_rel = '~/greek_treebank_perseus/pos_training_set.pos'
full_training_set = os.path.expanduser(full_training_set_rel)

# This section's code is good, but it times out in IPython.
# Consider using it in separate scripts.

# Per-tagger accuracy accumulators; each collects one value per CV fold.
unigram_accuracies = []
bigram_accuracies = []
trigram_accuracies = []
backoff_accuracies = []
two_prefix_accuracies = []
three_prefix_accuracies = []
four_prefix_accuracies = []
two_suffix_accuracies = []
three_suffix_accuracies = []
four_suffix_accuracies = []
five_suffix_accuracies = []
six_suffix_accuracies = []
tnt_accuracies = []

# Read the corpus and split it into sentences (blank-line separated).
# The treebank is Greek text, so force UTF-8 rather than the locale default.
with open(full_training_set, encoding='utf-8') as f:
    training_set_string = f.read()
pos_set = training_set_string.split('\n\n')  # mk into a list
sentence_count = len(pos_set)
# Fold size for 10-fold cross-validation (redundant int() casts removed —
# len() and the literal 10 are already ints).
tenth = math.ceil(sentence_count / 10)
# Shuffle so each fold is a random sample of the corpus.
random.shuffle(pos_set)
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    The final chunk is shorter when len(l) is not a multiple of n.
    http://stackoverflow.com/a/312464
    """
    start = 0
    while start < len(l):
        yield l[start:start + n]
        start += n
# Split the shuffled sentence list into tenths for 10-fold cross-validation.
ten_parts = list(chunks(pos_set, tenth)) # a list of 10 lists; each holds ~sentence_count/10 sentences (the last may be shorter)
# 10-fold cross-validation: each pass holds out one fold as the test set
# and trains/evaluates every tagger on the remaining nine folds.

def _train_and_score(tagger, test_sents, label, accuracies):
    """Score a trained tagger on test_sents; record and print its accuracy."""
    accuracy = tagger.evaluate(test_sents)
    accuracies.append(accuracy)
    print(label, accuracy)

# Scratch directory for the per-fold corpus files (created once, not per fold).
local_dir = os.path.expanduser('~/cltk_data/user_data')
if not os.path.isdir(local_dir):
    os.makedirs(local_dir)

for counter, test_set in enumerate(ten_parts):
    # All folds except the current one (identity comparison) form the training set.
    training_set_lists = [x for x in ten_parts if x is not test_set]
    # Flatten the remaining folds into one list of sentences
    # ( http://stackoverflow.com/a/952952 ).
    training_set = [item for sublist in training_set_lists for item in sublist]
    # NLTK's TaggedCorpusReader reads from disk, so write both splits out.
    test_path = os.path.join(local_dir, 'test_greek.pos')
    with open(test_path, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(test_set))
    train_path = os.path.join(local_dir, 'train_greek.pos')
    with open(train_path, 'w', encoding='utf-8') as f:
        f.write('\n\n'.join(training_set))
    # Read the POS corpora back as tagged sentences.
    train_sents = TaggedCorpusReader(local_dir, 'train_greek.pos').tagged_sents()
    test_sents = TaggedCorpusReader(local_dir, 'test_greek.pos').tagged_sents()
    print('Loop #' + str(counter))
    # Plain n-gram taggers.
    _train_and_score(UnigramTagger(train_sents), test_sents, 'Unigram:', unigram_accuracies)
    _train_and_score(BigramTagger(train_sents), test_sents, 'Bigram:', bigram_accuracies)
    _train_and_score(TrigramTagger(train_sents), test_sents, 'Trigram:', trigram_accuracies)
    # 1, 2, 3-gram backoff chain: trigram falls back to bigram, then unigram.
    tagger1 = UnigramTagger(train_sents)
    tagger2 = BigramTagger(train_sents, backoff=tagger1)
    tagger3 = TrigramTagger(train_sents, backoff=tagger2)
    _train_and_score(tagger3, test_sents, '1, 2, 3-gram backoff:', backoff_accuracies)
    # Prefix taggers (positive affix_length = word-initial affix in NLTK).
    _train_and_score(AffixTagger(train_sents, affix_length=2), test_sents,
                     '2-char prefix:', two_prefix_accuracies)
    _train_and_score(AffixTagger(train_sents, affix_length=3), test_sents,
                     '3-char prefix:', three_prefix_accuracies)
    _train_and_score(AffixTagger(train_sents, affix_length=4), test_sents,
                     '4-char prefix:', four_prefix_accuracies)
    # Suffix taggers. BUG FIX: NLTK's AffixTagger requires a *negative*
    # affix_length for suffixes (its default is -3); the positive values
    # used previously silently trained prefix taggers, so the "suffix"
    # results duplicated the prefix ones.
    _train_and_score(AffixTagger(train_sents, affix_length=-2), test_sents,
                     '2-char suffix:', two_suffix_accuracies)
    _train_and_score(AffixTagger(train_sents, affix_length=-3), test_sents,
                     '3-char suffix:', three_suffix_accuracies)
    _train_and_score(AffixTagger(train_sents, affix_length=-4), test_sents,
                     '4-char suffix:', four_suffix_accuracies)
    _train_and_score(AffixTagger(train_sents, affix_length=-5), test_sents,
                     '5-char suffix:', five_suffix_accuracies)
    _train_and_score(AffixTagger(train_sents, affix_length=-6), test_sents,
                     '6-char suffix:', six_suffix_accuracies)
    # TnT tagger; N=1000 is the default but won't finish with Greek.
    tnt_tagger = tnt.TnT(N=100)
    tnt_tagger.train(train_sents)
    _train_and_score(tnt_tagger, test_sents, 'TnT:', tnt_accuracies)
# This code was not run in IPython, but necessary for assembling statistics.
# Build one {name: {'mean': ..., 'sd': ...}} entry per tagger from the
# per-fold accuracies collected above. The table-driven loop replaces 13
# copy-pasted mean/stdev stanzas and also fixes a bug: the old code bound
# its TnT summary to the name `tnt`, shadowing the imported nltk.tag.tnt
# module.
final_accuracies_list = []
tagger_accuracies = [
    ('unigram', unigram_accuracies),
    ('bigram', bigram_accuracies),
    ('trigram', trigram_accuracies),
    ('1, 2, 3-gram backoff', backoff_accuracies),
    ('2 prefix', two_prefix_accuracies),
    ('3 prefix', three_prefix_accuracies),
    ('4 prefix', four_prefix_accuracies),
    ('2 suffix', two_suffix_accuracies),
    ('3 suffix', three_suffix_accuracies),
    ('4 suffix', four_suffix_accuracies),
    ('5 suffix', five_suffix_accuracies),
    ('6 suffix', six_suffix_accuracies),
    ('tnt', tnt_accuracies),
]
for name, accuracies in tagger_accuracies:
    final_accuracies_list.append(
        {name: {'mean': mean(accuracies), 'sd': stdev(accuracies)}})

# tnt values for 8/10 folds (recorded from the script run):
# [0.9553020305648972, 0.9542076240709662, 0.9564495709663806, 0.9546379227530868, 0.9518869884357039, 0.95546875, 0.9559369923006287, 0.9545558706424909]
# mean of those values: 0.9548057187167692

# Added to capture output when running as a script (left disabled):
# with open('final_accuracies.py', 'w') as f:
#     f.write(final_dict)

# Note: these values were generated from the above code,
# but through the above code run as a script, as IPython was timing out.
final_dict = {'trigram': {'mean': 0.7259902681754967, 'sd': 0.008456454235275485}, 'unigram': {'mean': 0.9051637750829489, 'sd': 0.0018505984564185247}, '1, 2, 3-gram backoff': {'mean': 0.952937576511621, 'sd': 0.0023986408561536845}, 'bigram': {'mean': 0.7366758737449184, 'sd': 0.007628422048101589}, 'tnt': {'mean': 0.9549570274227535, 'sd': 0.001320804203349713}}

# To rebuild final_dict from a live run instead, merge the per-tagger stats:
# final_dict = {}
# for stats in final_accuracies_list:
#     final_dict.update(stats)

# Tabulate the results; the trailing bare expression displays the frame
# in a notebook cell.
df = pd.DataFrame(final_dict)
df