In [1]:
from cltk.stem.lemma import LemmaReplacer
from cltk.stem.latin.j_v import JVReplacer
from cltk.stop.latin.stops import STOPS_LIST
from collections import Counter
from nltk.tokenize.punkt import PunktWordTokenizer
from nltk.tokenize import RegexpTokenizer
import numpy
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
In [2]:
# lemmatizing is optional and probably not useful 
# in this tf-idf case
#lemmatizer = LemmaReplacer('latin')
In [3]:
def build_phi5_index():
    """Parse the PHI5 ``AUTHTAB.DIR`` binary index into ``{file: author_name}``.

    Reads ``~/cltk_data/originals/phi5/AUTHTAB.DIR``, decodes it as
    Latin-1, and splits on the ``'\xff'`` record separator.
    Should return 362 files.

    Returns:
        dict: mapping of plaintext file name (e.g. ``'LAT0474.TXT'``) to
        author name.
    """
    # Compile patterns once, outside the loop; the raw string avoids the
    # invalid '\d' escape warning on modern Python.
    pattern_file = re.compile(r'LAT[\d].{4}')
    # Strips PHI markup artifacts from the author-name portion.
    pattern_author = re.compile(r'&1|&ƒl|l$|&|1$|\x83')
    # '\x80' appears to encode a comma separator in author names.
    pattern_comma = re.compile('\x80')

    index_path_rel = '~/cltk_data/originals/phi5/AUTHTAB.DIR'
    index_path = os.path.expanduser(index_path_rel)
    with open(index_path, 'rb') as f:
        r = f.read()
    # First field and the last 21 fields are non-author metadata.
    index_all = r.decode('latin-1').split('\xff')[1:-21]
    index = [x for x in index_all if x]
    file_author = {}
    for x in index:
        # File name: 'LAT' + digit + 4 chars; drop the trailing char and
        # append the plaintext extension.
        m = pattern_file.match(x)
        file_name = m.group()[:-1] + '.TXT'

        # Author name: everything after the file-name prefix, cleaned up.
        author_name = pattern_file.split(x)[-1]
        author_name = pattern_author.sub('', author_name)
        author_name = pattern_comma.sub(', ', author_name)
        file_author[file_name] = author_name

    return file_author
In [4]:
# Map each PHI5 file name to its author, then build the list of
# plaintext file paths to process.
phi5_index = build_phi5_index()

plaintext_dir_rel = '~/cltk_data/latin/text/phi5/plaintext/'
plaintext_dir = os.path.expanduser(plaintext_dir_rel)
# Iterating the dict yields its keys, i.e. the 'LATnnnn.TXT' file names.
phi5_files = [os.path.join(plaintext_dir, x) for x in phi5_index]
In [5]:
from cltk.corpus.utils.formatter import cleanup_tlg_txt, remove_non_ascii

def join_hyphens(str_text):
    """Undo end-of-line hyphenation by deleting every '-\\n' sequence."""
    return ''.join(str_text.split('-\n'))

def join_lines(str_text):
    """Collapse the text onto one line, turning each newline into a space."""
    return ' '.join(str_text.split('\n'))

def rm_titles(text):
    """Remove editorial titles, which the PHI5 plaintext wraps in braces.

    The pattern is non-greedy so that separate '{...}' spans are removed
    individually rather than everything from the first '{' to the last '}'.
    """
    # Raw string: '\{' is an invalid escape in a plain string literal and
    # raises a SyntaxWarning on Python 3.12+.
    pattern = re.compile(r'\{.+?\}')
    return pattern.sub('', text)

def rm_pointed_brackets(str_text):
    """Delete editorial angle brackets, keeping the text they enclose."""
    for bracket in ('<', '>'):
        str_text = str_text.replace(bracket, '')
    return str_text

def phi5_plaintext_cleanup(str_text):
    """Run the full PHI5 cleanup pipeline: de-hyphenate, flatten to one
    line, then drop {titles} and pointed brackets.
    """
    cleaned = join_hyphens(str_text)
    cleaned = join_lines(cleaned)
    cleaned = rm_titles(cleaned)
    return rm_pointed_brackets(cleaned)

def clean_read(file_path):
    """Read a PHI5 plaintext file and return its cleaned-up contents."""
    with open(file_path) as f:
        return phi5_plaintext_cleanup(f.read())
In [6]:
# Read and clean every PHI5 plaintext file; order follows phi5_files.
documents = [clean_read(f) for f in phi5_files]
In [7]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.fit_transform
# TfidfVectorizer L2-normalizes rows by default, so the product of the
# matrix with its transpose yields pairwise cosine similarities.
tfidf = TfidfVectorizer().fit_transform(documents)
# .toarray() densifies the sparse result; the sparse `.A` shorthand is
# deprecated and removed in recent SciPy versions.
pairwise_similarity = (tfidf * tfidf.T).toarray()
In [8]:
# Save the similarity matrix as text: 362 lines x 362 scores each,
# space-delimited with 8 decimal places.
ps_file_rel = '~/cltk_data/user_data/phi5_tfidf_pairwise_sims.txt'
ps_rel = os.path.expanduser(ps_file_rel)
numpy.savetxt(ps_rel,
              pairwise_similarity,
              fmt='%1.8f')
In [9]:
# Match matrix row numbers with the order authors appear in phi5_index:
# {row_index: author_name}. Dict iteration order is insertion order, so
# this lines up with the rows written by numpy.savetxt above.
counter_author = dict(enumerate(phi5_index.values()))
In [10]:
# Build {author_name: list of score strings} by re-reading the saved
# matrix; row order corresponds to counter_author's indices.
author_scores = {}
import csv
with open(ps_rel, newline='') as csvfile:
    array_reader = csv.reader(csvfile, delimiter=' ')
    for line, numbers in enumerate(array_reader):
        # Row `line` of the matrix belongs to the author at that index.
        row_author_name = counter_author[line]
        author_scores[row_author_name] = numbers  # scores remain strings
In [11]:
# Flatten into {'<author A> v. <author B>': score} for every pair of rows.
final_pairwise_comps = {}
for row_author, scores in author_scores.items():
    for col_index, score in enumerate(scores):
        # Skip any column index without a known author.
        if col_index not in counter_author:
            continue
        col_author = counter_author[col_index]
        final_pairwise_comps[row_author + ' v. ' + col_author] = score
In [12]:
# Pretty-print the final {pair: score} dict to a results file.
from pprint import pprint

ps_file_final_rel = '~/cltk_data/user_data/phi5_tfidf_pairwise_sims_final.txt'
ps_final = os.path.expanduser(ps_file_final_rel)
with open(ps_final, 'w') as out:
    pprint(final_pairwise_comps, stream=out)