This notebook cross-validates the CLTK's part-of-speech taggers. The final results are found at the very bottom.

In [3]:
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import AffixTagger
from nltk.tag import BigramTagger
from nltk.tag import tnt
from nltk.tag import TrigramTagger
from nltk.tag import UnigramTagger
from nltk.tokenize import wordpunct_tokenize
import math
import os
import pandas as pd
import random
from statistics import mean
from statistics import stdev
In [2]:
# Path (with '~') to the POS-tagged training data from the Perseus Greek treebank.
full_training_set_rel = '~/greek_treebank_perseus/pos_training_set.pos'
# Expand '~' to the user's home directory so the file can be opened below.
full_training_set = os.path.expanduser(full_training_set_rel)
In []:
# This section's code works, but it times out when run in IPython.
# Consider running it as a standalone script instead.
# One accuracy list per tagger variant; each collects one score per CV fold.
unigram_accuracies = []
bigram_accuracies = []
trigram_accuracies = []
backoff_accuracies = []
two_prefix_accuracies = []
three_prefix_accuracies = []
four_prefix_accuracies = []
two_suffix_accuracies = []
three_suffix_accuracies = []
four_suffix_accuracies = []
five_suffix_accuracies = []
six_suffix_accuracies = []
tnt_accuracies = []

with open(full_training_set) as f:
    training_set_string = f.read()
    # Sentences in the .pos file are separated by blank lines.
    pos_set = training_set_string.split('\n\n')  # mk into a list

sentence_count = len(pos_set)
# Fold size for 10-fold cross-validation; ceil so every sentence is covered.
tenth = math.ceil(sentence_count / 10)

# Shuffle so the folds are not biased by the original document order.
random.shuffle(pos_set)

def chunks(l, n):
    """Yield successive n-sized chunks from l.
    http://stackoverflow.com/a/312464
    """
    start = 0
    while start < len(l):
        yield l[start:start + n]
        start += n

# Split the shuffled sentences into 10 folds for cross-validation.
ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists of sentences

def train_and_eval(tagger_class, train_sents, test_sents, label, accuracies, **kwargs):
    """Train ``tagger_class`` on ``train_sents``, score it on ``test_sents``,
    append the accuracy to ``accuracies``, print it with ``label``, and return
    the trained tagger (so it can serve as a backoff for another tagger)."""
    tagger = tagger_class(train_sents, **kwargs)
    accuracy = tagger.evaluate(test_sents)
    accuracies.append(accuracy)
    print(label + ':', accuracy)
    return tagger

for counter, part in enumerate(ten_parts):
    # The current fold is the held-out test set; the remaining 9 folds,
    # flattened into one list of sentences, are the training set.
    test_set = part
    training_set = [sent
                    for fold_number, fold in enumerate(ten_parts)
                    if fold_number != counter
                    for sent in fold]

    # Write this fold's train/test sentences to disk so that
    # TaggedCorpusReader can parse them back as tagged corpora.
    local_dir = os.path.expanduser('~/cltk_data/user_data')
    if not os.path.isdir(local_dir):
        os.makedirs(local_dir)

    test_path = os.path.join(local_dir, 'test_greek.pos')
    with open(test_path, 'w') as f:
        f.write('\n\n'.join(test_set))

    train_path = os.path.join(local_dir, 'train_greek.pos')
    with open(train_path, 'w') as f:
        f.write('\n\n'.join(training_set))

    # Read POS corpora back as lists of tagged sentences.
    train_sents = TaggedCorpusReader(local_dir, 'train_greek.pos').tagged_sents()
    test_sents = TaggedCorpusReader(local_dir, 'test_greek.pos').tagged_sents()

    print('Loop #' + str(counter))

    # Plain n-gram taggers.
    unigram_tagger = train_and_eval(UnigramTagger, train_sents, test_sents,
                                    'Unigram', unigram_accuracies)
    train_and_eval(BigramTagger, train_sents, test_sents,
                   'Bigram', bigram_accuracies)
    train_and_eval(TrigramTagger, train_sents, test_sents,
                   'Trigram', trigram_accuracies)

    # 1, 2, 3-gram backoff chain: trigram falls back to bigram, then unigram.
    tagger2 = BigramTagger(train_sents, backoff=unigram_tagger)
    tagger3 = TrigramTagger(train_sents, backoff=tagger2)
    backoff_accuracy = tagger3.evaluate(test_sents)
    backoff_accuracies.append(backoff_accuracy)
    print('1, 2, 3-gram backoff:', backoff_accuracy)

    # Prefix taggers: a positive affix_length tells AffixTagger to tag by
    # the first N characters of each word.
    train_and_eval(AffixTagger, train_sents, test_sents,
                   '2-char prefix', two_prefix_accuracies, affix_length=2)
    train_and_eval(AffixTagger, train_sents, test_sents,
                   '3-char prefix', three_prefix_accuracies, affix_length=3)
    train_and_eval(AffixTagger, train_sents, test_sents,
                   '4-char prefix', four_prefix_accuracies, affix_length=4)

    # Suffix taggers: AffixTagger requires a NEGATIVE affix_length to use
    # word-final characters.
    # BUGFIX: the original code passed positive lengths here, which silently
    # re-evaluated prefix taggers instead of suffix taggers.
    train_and_eval(AffixTagger, train_sents, test_sents,
                   '2-char suffix', two_suffix_accuracies, affix_length=-2)
    train_and_eval(AffixTagger, train_sents, test_sents,
                   '3-char suffix', three_suffix_accuracies, affix_length=-3)
    train_and_eval(AffixTagger, train_sents, test_sents,
                   '4-char suffix', four_suffix_accuracies, affix_length=-4)
    train_and_eval(AffixTagger, train_sents, test_sents,
                   '5-char suffix', five_suffix_accuracies, affix_length=-5)
    train_and_eval(AffixTagger, train_sents, test_sents,
                   '6-char suffix', six_suffix_accuracies, affix_length=-6)

    # TnT tagger; N=1000 is the default but won't finish with Greek.
    tnt_tagger = tnt.TnT(N=100)
    tnt_tagger.train(train_sents)
    tnt_accuracy = tnt_tagger.evaluate(test_sents)
    tnt_accuracies.append(tnt_accuracy)
    print('TnT:', tnt_accuracy)
In []:
# This code was not run in IPython, but necessary for assembling statistics.

# (label, per-fold accuracy list) pairs, in the original report order.
accuracy_lists = [
    ('unigram', unigram_accuracies),
    ('bigram', bigram_accuracies),
    ('trigram', trigram_accuracies),
    ('1, 2, 3-gram backoff', backoff_accuracies),
    ('2 prefix', two_prefix_accuracies),
    ('3 prefix', three_prefix_accuracies),
    ('4 prefix', four_prefix_accuracies),
    ('2 suffix', two_suffix_accuracies),
    ('3 suffix', three_suffix_accuracies),
    ('4 suffix', four_suffix_accuracies),
    ('5 suffix', five_suffix_accuracies),
    ('6 suffix', six_suffix_accuracies),
    ('tnt', tnt_accuracies),
]

# Build one {label: {'mean': ..., 'sd': ...}} dict per tagger, same
# structure and order as the original copy-pasted version produced.
# BUGFIX: the original bound the TnT stats dict to the name `tnt`,
# shadowing the imported `nltk.tag.tnt` module; this loop avoids that.
final_accuracies_list = []
for label, accuracies in accuracy_lists:
    final_accuracies_list.append(
        {label: {'mean': mean(accuracies), 'sd': stdev(accuracies)}}
    )

# tnt values for 8/10 folds (from an earlier partial run):
# [0.9553020305648972, 0.9542076240709662, 0.9564495709663806, 0.9546379227530868, 0.9518869884357039, 0.95546875, 0.9559369923006287, 0.9545558706424909]
#In [25]: mean
#Out[25]: 0.9548057187167692


# added to capture output when running as script
'''
with open('final_accuracies.py', 'w') as f:
    f.write(final_dict)
'''
In [4]:
# Note: these values were generated by the cross-validation code above,
# run as a standalone script because IPython was timing out.
# Key insertion order is preserved from the original literal.
final_dict = {
    'trigram': {'mean': 0.7259902681754967, 'sd': 0.008456454235275485},
    'unigram': {'mean': 0.9051637750829489, 'sd': 0.0018505984564185247},
    '1, 2, 3-gram backoff': {'mean': 0.952937576511621, 'sd': 0.0023986408561536845},
    'bigram': {'mean': 0.7366758737449184, 'sd': 0.007628422048101589},
    'tnt': {'mean': 0.9549570274227535, 'sd': 0.001320804203349713},
}
In [5]:
# The block below (kept as an inert string) shows how `final_dict` was
# originally assembled from `final_accuracies_list` when the full
# pipeline ran as a script; here `final_dict` comes from the cell above.
'''
final_dict = {}
for x in final_accuracies_list:
    final_dict.update(x)
'''

# Render the per-tagger mean/sd statistics as a table.
df = pd.DataFrame(final_dict)
df
Out[5]:
1, 2, 3-gram backoff bigram tnt trigram unigram
mean 0.952938 0.736676 0.954957 0.725990 0.905164
sd 0.002399 0.007628 0.001321 0.008456 0.001851
In []: