from cltk.corpus.greek.tlg_indices import TLG_INDEX
from cltk.corpus.utils.formatter import assemble_tlg_author_filepaths
from cltk.corpus.utils.formatter import tlg_plaintext_cleanup
from cltk.stop.greek.stops import STOPS_LIST
from collections import Counter
from nltk.tokenize.punkt import PunktLanguageVars
from nltk.tokenize import RegexpTokenizer
import numpy
import os
from pprint import pprint
import re
from sklearn.feature_extraction.text import TfidfVectorizer
# Paths of the TLG author files whose texts are compared below; each one is
# read and cleaned through clean_read() when the document list is built.
tlg_files = assemble_tlg_author_filepaths()
def clean_read(input_file):
    """Read a TLG plaintext file and return its cleaned-up contents.

    :param input_file: Path of the file to read.
    :returns: The result of ``tlg_plaintext_cleanup`` applied to the whole
        text of the file.
    """
    # Decode explicitly instead of relying on the platform default, which
    # cannot represent Greek on some systems (e.g. cp1252 on Windows).
    # NOTE(review): assumes the converted TLG files are UTF-8 -- confirm.
    with open(input_file, encoding='utf-8') as f:
        raw_text = f.read()
    return tlg_plaintext_cleanup(raw_text)
def clean_read_gen(input_file):
    """Yield the cleaned-up contents of a TLG plaintext file.

    Generator variant of reading and cleaning a file: it produces exactly
    one item, the result of ``tlg_plaintext_cleanup`` applied to the whole
    text of the file.

    :param input_file: Path of the file to read.
    """
    # Decode explicitly instead of relying on the platform default, which
    # cannot represent Greek on some systems (e.g. cp1252 on Windows).
    # NOTE(review): assumes the converted TLG files are UTF-8 -- confirm.
    with open(input_file, encoding='utf-8') as f:
        raw_text = f.read()
    # The file is already closed here, so a consumer that never resumes the
    # generator does not keep the handle open.
    yield tlg_plaintext_cleanup(raw_text)
# Build the corpus eagerly, one cleaned text per file. fit_transform()
# accepts any iterable of documents, so a generator could be passed instead
# if holding every text in memory at once becomes a problem.
documents = [clean_read(f) for f in tlg_files]
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer.fit_transform
tfidf = TfidfVectorizer().fit_transform(documents)
# Multiplying the sparse TF-IDF matrix by its transpose gives one score per
# pair of documents. Densify with toarray(): the `.A` shorthand is deprecated
# and no longer available in recent SciPy releases.
pairwise_similarity = (tfidf * tfidf.T).toarray()
# print arrays, 1823 lines x 1823 scores each
ps_file_rel = '~/cltk_data/user_data/tlg_tfidf_pairwise_sims.txt'
ps_file = os.path.expanduser(ps_file_rel)
# savetxt() does not create missing directories, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(ps_file), exist_ok=True)
# One matrix row per line, scores separated by savetxt's default delimiter
# (a single space) and written with 8 decimal places.
numpy.savetxt(ps_file, pairwise_similarity, fmt='%1.8f')
# Map each position (0, 1, 2, ...) in TLG_INDEX's iteration order to the
# value stored there, so that a line number of the saved matrix can be turned
# into an author.
# NOTE(review): this assumes TLG_INDEX iterates in the same order as the
# files in tlg_files -- confirm.
counter_author = {
    position: TLG_INDEX[key]
    for position, key in enumerate(TLG_INDEX)
}
# Read the saved matrix back and file each row of scores under the author
# that counter_author associates with that line number. The scores stay as
# the strings found in the file.
import csv
author_scores = {}
with open(ps_file, newline='') as matrix_file:
    for row_number, row in enumerate(csv.reader(matrix_file, delimiter=' ')):
        author_scores[counter_author[row_number]] = row
# Flatten the per-author rows into one entry per pair, keyed
# "<row author> v. <column author>".
final_pairwise_comps = {}
for row_author, row_scores in author_scores.items():
    for column, value in enumerate(row_scores):
        # Columns without a known author are skipped.
        if column not in counter_author:
            continue
        pair_label = ' v. '.join((row_author, counter_author[column]))
        final_pairwise_comps[pair_label] = value

# Pretty-print the whole mapping to the final report file.
ps_file_final_rel = '~/cltk_data/user_data/tlg_tfidf_pairwise_sims_final.txt'
ps_final = os.path.expanduser(ps_file_final_rel)
with open(ps_final, mode='w') as report:
    pprint(final_pairwise_comps, stream=report)