In [1]:
from cltk.corpus.greek.tlg_indices import TLG_INDEX
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stop.greek.stops import STOPS_LIST
from collections import Counter
from nltk.tokenize.punkt import PunktLanguageVars
from nltk.tokenize import RegexpTokenizer
import numpy
import os
from pprint import pprint
import re
from sklearn.feature_extraction.text import TfidfVectorizer
In [3]:
tlg_files = assemble_tlg_author_filepaths()

'''
for f in tlg_files:
    with open(f) as fo:
        fr = fo.read()
    clean_text = tlg_plaintext_cleanup(fr)
'''


def clean_read(input_file):
    """Return the cleaned text of a file as a string."""
    with open(input_file) as f:
        r = f.read()
        return tlg_plaintext_cleanup(r)

def clean_read_gen(input_file):
    """Yield the cleaned text of a file (generator version)."""
    with open(input_file) as f:
        r = f.read()
        yield tlg_plaintext_cleanup(r)

#! Can scikit-learn handle a generator? The regular (non-generator) function could be memory-intensive.
documents = [clean_read(f) for f in tlg_files]
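
On the question in the comment above: TfidfVectorizer.fit_transform() accepts any iterable of strings, so a generator would work, but the vocabulary and tf-idf matrix still end up in memory. A lighter-weight alternative, sketched here but not used below, is to let the vectorizer read each file lazily by passing input='filename', with the cleanup hooked in as the preprocessor (note this bypasses the default lowercasing):

# sketch only, assuming the same tlg_files list as above; not run here
lazy_vectorizer = TfidfVectorizer(input='filename',
                                  preprocessor=tlg_plaintext_cleanup)
# lazy_tfidf = lazy_vectorizer.fit_transform(tlg_files)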
In [4]:
len(documents)
Out[4]:
1823
In [5]:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.fit_transform
tfidf = TfidfVectorizer().fit_transform(documents)
pairwise_similarity = (tfidf * tfidf.T).A
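
Because TfidfVectorizer L2-normalizes each row by default (norm='l2'), the product tfidf * tfidf.T is exactly the matrix of cosine similarities between documents. The same dense array can be obtained more explicitly with scikit-learn's helper, shown here as an equivalent alternative:

from sklearn.metrics.pairwise import cosine_similarity

# accepts the sparse tf-idf matrix directly and returns a dense n_docs x n_docs array
pairwise_similarity = cosine_similarity(tfidf)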
In [6]:
# save the full similarity matrix as text: 1823 rows x 1823 scores each
ps_file_rel = '~/cltk_data/user_data/tlg_tfidf_pairwise_sims.txt'
ps_file = os.path.expanduser(ps_file_rel)
numpy.savetxt(ps_file,
              pairwise_similarity,
              fmt='%1.8f')
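
Since numpy wrote the file, numpy can also read it back in one call; this is a possible alternative to the csv parsing in the next cell, should the scores be needed as floats:

# round-trip check: reload the saved matrix as a float array
reloaded = numpy.loadtxt(ps_file)
assert reloaded.shape == pairwise_similarity.shape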
In [7]:
# now match the matrix's row numbers with their order in TLG_INDEX
# make a dict mapping each row index to its author name
counter_author = {}
for x, y in enumerate(TLG_INDEX):
    counter_author[x] = TLG_INDEX[y]
In [8]:
# make a dict mapping each author name to its list of scores
author_scores = {}
import csv
with open(ps_file, newline='') as csvfile:
    array_reader = csv.reader(csvfile, delimiter=' ')
    for line, numbers in enumerate(array_reader):
        row_author_name = counter_author[line]
        author_scores[row_author_name] = numbers
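
A caveat here: csv.reader returns every field as a string, so the values stored in author_scores (and, below, in final_pairwise_comps) are strings rather than floats. If numeric sorting or thresholding is wanted later, a small conversion step along these lines would do it:

# convert each author's row of scores from strings to floats
author_scores = {name: [float(s) for s in scores]
                 for name, scores in author_scores.items()}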
In [9]:
final_pairwise_comps = {}
for name, scores in author_scores.items():
    for number, score in enumerate(scores):
        if number in counter_author.keys():
            author_name_score = counter_author[number]
            author_comparison_pair = name + ' v. ' + author_name_score
            final_pairwise_comps[author_comparison_pair] = score
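
With the comparison dictionary built, a natural next step (a sketch, not part of the original run) is to rank the pairs and look at the most similar authors, skipping the trivial self-comparisons whose score is 1.0:

# sort descending by score; float() makes this safe whether the scores are strings or floats
ranked = sorted(final_pairwise_comps.items(), key=lambda kv: float(kv[1]), reverse=True)
top_pairs = [(pair, score) for pair, score in ranked
             if pair.split(' v. ')[0] != pair.split(' v. ')[1]]
pprint(top_pairs[:10])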
In [10]:
ps_file_final_rel = '~/cltk_data/user_data/tlg_tfidf_pairwise_sims_final.txt'
ps_final = os.path.expanduser(ps_file_final_rel)
with open(ps_final, 'w') as out:
    pprint(final_pairwise_comps, stream=out)
In [ ]: