About

This notebook illustrates the CLTK's sentence tokenizer for the Latin language. First, I offer the code with which I tokenized, and counted, sentences and words in the PHI5. Next, the notebook offers a few simple views of this data, organized globally and for specific genres.

Tokenize and count

This first step tokenizes words and sentences, then returns the ratio of average words per sentence for each author in the PHI5 disk. The following code runs in under 5 minutes (download here). This script generates a file called called phi5_auth_sent_data_v3.txt (download).

"""For computing sentence length data for PHI5 authors."""

import ast
from cltk.tokenize.sentence_tokenizer_latin import tokenize_latin_sentences
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import os
import re


# PHI5 parameters
PHI5_DIR = os.path.join('~/cltk_data', 'compiled', 'phi5')
PHI5_INDEX_REL = os.path.join(PHI5_DIR, 'index_author_works.txt')

# Local write parameters
WRITE_FILE_NAME = 'phi5_auth_sent_data_v3.txt'
WRITE_DIR_LOCATION = '~/Downloads'


class AvgSents(object):
    """Contains methods for computing average sentence length."""

    def open_file(self, abs_path):
        """Open any file with an absolute path."""
        try:
            with open(abs_path) as file:
                read_file = file.read()
            return read_file
        except:
            pass

    def clean_text(self, input_text):
        """Clean text.
        TODO: rm '\x00'
        """
        output_text = re.sub(r'\{\d.+?\}\d|\d+?|\{.+?\}|\.{3}|@|\?|!|`|&|%|\$|\#|\+|-|—|\[|\]|<|>|\\\.|!|\.\s\.|\.\.|p\.\s|,\s,|\\|\*', '', input_text)
        return output_text

    def phi5_author_path(self, author, index_dict):
        """Build absolute path to PHI5 author file."""
        print('Processing author:', author)
        phi5_file = index_dict[author]['phi5_file']
        phi5_file_txt = os.path.join(phi5_file + '.txt')
        phi5_file_rel = os.path.join(PHI5_DIR, phi5_file_txt)
        phi5_file_abs = os.path.expanduser(phi5_file_rel)
        return phi5_file_abs

    def count_tokenize_sents(self, author_read):
        """Tokenize and count sentences in a string."""
        try:
            sents = tokenize_latin_sentences(author_read)
            total_sents = len(sents)
            tally_dict = self.tally_of_sentence_lengths(sents)
        except:
            pass
        print('Total sentences:', total_sents)
        print('Tally of sentence lengths:', tally_dict)
        return total_sents, tally_dict

    def write_dict(self, auth_s_w_dict):
        """Write author sentence word data into dict in a file."""
        write_rel_path = os.path.join(WRITE_DIR_LOCATION, WRITE_FILE_NAME)
        write_abs_path = os.path.expanduser(write_rel_path)
        try:
            with open(write_abs_path, 'w') as file:
                try:
                    file.write(str(auth_s_w_dict))
                except:
                    pass
        except:
            pass

    def count_tokenize_words(self, author_read):
        """Tokenize and count words in a string."""
        tokenizer = RegexpTokenizer('\s+', gaps=True)
        try:
            words = tokenizer.tokenize(author_read)
            return len(words)
        except:
            pass

    def tally_of_sentence_lengths(self, sents_list):
        """Count the total occurrences of each sentence length. I.e.,
        {1: 4, 2: 3, 3: 7, 8:12, ... 19: 400, 20: 379, 21: 433, ... },
        as in: {'number of words in a sentence': 'number of times this number
        of words occurs in an author'}
        """
        tally_list = []
        for sentence in sents_list:
            word_tokenizer = RegexpTokenizer('\s+', gaps=True)
            try:
                sentence_words = word_tokenizer.tokenize(sentence)
                sentence_word_length = len(sentence_words)
                tally_list.append(sentence_word_length)
            except:
                pass
        tally_counter = Counter(tally_list)
        tally_dict = dict(tally_counter)
        return tally_dict


def main():
    """Main function"""

    avg = AvgSents()

    phi5_index_abs = os.path.expanduser(PHI5_INDEX_REL)
    index_read = avg.open_file(phi5_index_abs)
    index_dict = ast.literal_eval(index_read)

    auth_s_w_dict = {}
    for author in index_dict:
        phi5_file_abs = avg.phi5_author_path(author, index_dict)
        author_read = avg.open_file(phi5_file_abs)
        avg.clean_text(author_read)
        try:
            sent_count, tally_dict = avg.count_tokenize_sents(author_read)
        except:
            sent_count = 0
        try:
            word_count = avg.count_tokenize_words(author_read)
        except:
            word_count = 0
        try:
            avg_words_per_sent = word_count / sent_count
        except:
            avg_words_per_sent = 0
        counts = {'sent_count': sent_count,
                  'word_count': word_count,
                  'avg_words_per_sent': avg_words_per_sent,
                  'tally_of_sent_word_lengths': tally_dict}
        auth_s_w_dict[author] = counts
    avg.write_dict(auth_s_w_dict)

if __name__ == "__main__":
    main()

Views

Next, I offer some simple views of the file phi5_auth_sent_data_v3.txt using the data analysis library Pandas.

In [1]:
import ast
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd

#display enough rows for all PHI5 authors
pd.set_option('display.max_rows', 10000)
In [2]:
file_name = 'phi5_auth_sent_data_v3.txt'
dir_location = '~/Downloads'
rel_path = os.path.join(dir_location, file_name)
abs_path = os.path.expanduser(rel_path)
with open(abs_path) as f:
    r = f.read()
d = ast.literal_eval(r)
In [3]:
# generate the Pandas DataFrame
df = pd.DataFrame(d)

#rm tally from these views
df = df.drop('tally_of_sent_word_lengths')

All authors sorted by average words per sentence

Outliers are apparent on the low and high end (fragmentary texts with odd formatting). Filtering out authors with less than, say, 1000 words would return better results. I am leaving these outliers in, however, because they will help me to improve the CLTK's text cleaner and sentence tokenizer.

In [4]:
df.T.sort('avg_words_per_sent', ascending=1)
Out[4]:
avg_words_per_sent sent_count word_count
Carmen de Bello Aegyptiaco 1.51926 1947 2958
Publius Cornelius Dolabella 2 2 4
Lucius Iulius Caesar 2.111111 9 19
Lucius Marcius Philippus 2.125 8 17
Publius Clodius Pulcher 2.529412 17 43
Caesellius Vindex 2.571429 49 126
Gnaeus Domitius Ahenobarbus 2.6 5 13
Lucius Cincius Alimentus 2.6 5 13
Altercatio Hadr. et Epicteti 2.829431 299 846
Luscius Lanuvinus 2.833333 6 17
Lucius Orbilius Pupillus 2.857143 7 20
Marcus Calidius 3 10 30
Gallus Antipater 3 2 6
Marcus Antonius 3 6 18
Gaius Aquilius Gallus 3 2 6
Fabius Dossennus 3 2 6
Titus Labienus 3.142857 7 22
Gaius Memmius 3.2 5 16
Marcus Iuventius Laterensis 3.333333 33 110
Gnaeus Marcius vates 3.4 5 17
Anonymi Fragmenta de Iure Fisci 3.495614 228 797
Imp. Marcus Ulpius Traianus; Trajan 3.5 2 7
Attius Labeo 3.5 2 7
Lucius Neratius Priscus 3.5 10 35
Lucius Annaeus Cornutus 3.636364 44 160
Vibius Crispus 3.714286 7 26
P. Cornel. Scipio Nasica Ser. 3.772727 22 83
Publius Rutilius Rufus 3.8 20 76
Aemilius Asper 3.864307 678 2620
Hostius 3.904762 21 82
Scaevus Memor 4 2 8
Appius Claudius Caecus 4 7 28
Aprissius (?) 4 2 8
Atilius 4 7 28
Titius, gram. 4 2 8
Cornificius Gallus 4 2 8
Pompilius 4 5 20
Ablabius 4 3 12
Iulius Modestus 4.095238 21 86
Gnaeus Gellius 4.222222 27 114
Sextus Pompeius Festus 4.296461 9917 42608
Antonius Panurgus 4.315789 19 82
Mummius 4.333333 9 39
Sevius Nicanor 4.333333 3 13
Marcus Verrius Flaccus 4.371429 70 306
Gaius Erucius 4.380952 21 92
Lucius Cornelius Sisenna 4.485119 336 1507
Vagellius 4.5 4 18
Lucius Coelius Antipater 4.505495 91 410
Granius Licinianus 4.517206 988 4463
Lucius Ateius Praetextatus 4.571429 7 32
Q. Pompeius Q.f.Q.n. Rufus 4.666667 3 14
Arbonius Silo 4.666667 3 14
Marullus 4.666667 3 14
Publius Saturius 4.7 10 47
Cornificius Longus 4.888889 9 44
Passienus Crispus 5 2 10
Marcus Aurelius 5 1 5
Valerius, comoed. 5 2 10
Clodius Tuscus 5 2 10
Quintus Hortensius Hortalus 5 7 35
Gaius Scribonius Curio pater 5 3 15
Volumnius 5 1 5
Sextus Turpilius 5.003356 298 1491
Caecilius Statius 5.071066 394 1998
Lucius Arruntius 5.125 8 41
Titinius 5.128889 225 1154
Commentarii Consulares 5.166667 12 62
Marcus Aemilius Scaurus 5.238095 21 110
Titus Maccius Plautus 5.270408 34496 181808
Novius, comoed. 5.294872 156 826
Quintus Valerius Soranus 5.333333 6 32
M. Aemilius Lepidus Porcina 5.333333 3 16
Aulus Cascellius 5.333333 3 16
Gaius Scribonius Curio avus 5.333333 3 16
Marcus Porcius Cato Uticensis 5.357143 14 75
Titus Quinctius Atta 5.366667 30 161
Gaius Calpurnius Piso 5.470588 17 93
Gaius Cassius Hemina 5.471698 53 290
L. Aelius Praeconinus Stilo 5.5 32 176
Iuventius, comoed. 5.5 10 55
Q. Lutatius Catulus iunior 5.5 2 11
Gaius Clodius Licinus 5.5 4 22
Lucius Afranius 5.503093 485 2669
Marcus Antistius Labeo 5.62963 54 304
Sextus (vel Spurius) Ennius 5.636364 11 62
Naevius, iunior 5.666667 3 17
M. Valerius Messalla Corvinus 5.75 16 92
Marcus Porcius Cato M.f.M.n. 5.75 8 46
Anonymi Comici et Tragici 5.876744 430 2527
Cn. Cornel. Lentulus Marcell. 5.909091 11 65
Iulius Africanus 6 2 12
Gaius Papirius Carbo Arvina 6 5 30
Staberius Eros 6 2 12
Sinnius Capito 6.076923 13 79
Gaius Iulius Caesar Strabo 6.083333 12 73
Valerius Antias 6.1 20 122
Granius Flaccus 6.117647 17 104
Quintus Lutatius Catulus 6.166667 12 74
Aulus Furius Antias 6.166667 6 37
Lucius Cassius Longinus 6.2 5 31
Publius Cominius 6.2 5 31
Numitorius 6.25 4 25
Turranius Niger 6.333333 3 19
Saserna 6.333333 6 38
Marcus Pacuvius 6.362187 439 2793
Lucius Pomponius Bononiensis 6.364444 225 1432
Quintus Claudius Quadrigarius 6.39 200 1278
Publius Alfenus Varus 6.444444 9 58
Q. Mucius Scaevola [pontifex] 6.458333 24 155
Publius Aufidius Namusa 6.5 2 13
Hilarius Arelatensis 6.5 4 26
Turnus 6.5 2 13
Titus Annius Luscus 6.5 2 13
Aulus Postumius Albinus 6.5 2 13
Publilius Syrus 6.50545 734 4775
Gaius Servilius Glaucia 6.6 5 33
Cn. Arulenus Caelius Sabinus 6.611111 18 119
Marcus Antonius triumvir 6.615385 13 86
Hadrianus 6.692308 13 87
Gaius Ateius Capito 6.717647 85 571
Quintus Servilius Caepio 6.75 4 27
Lucius Quinctius 6.8 5 34
Gaius Trebatius Testa 6.8 20 136
Gnaeus Naevius 6.969811 265 1847
Q. Fabius Maximus Servilianus 7 3 21
Quintus Mucius Scaevola 7 1 7
Papinius, epigram. 7 4 28
Publius Mucius Scaevola 7 2 14
Anonymi de Differentiis [Fronto] 7.066038 424 2996
Trabea 7.142857 7 50
Aurelius Opillus 7.181818 11 79
Lucius Accius 7.181944 720 5171
Flavius Caper 7.189931 437 3142
Lucius Livius Andronicus 7.195402 87 626
Marcus Iunius Brutus [tyr.] 7.2 5 36
Quintus Aelius Tubero 7.285714 14 102
Sextus Pomponius 7.333333 3 22
Sextus Paconianus 7.333333 3 22
Publius Terentius Afer; Terence 7.350471 7430 54614
Marcus Iunius Gracchanus 7.421053 19 141
Annianus 7.428571 7 52
Gaius Asinius Gallus 7.454545 11 82
Lucilius iunior 7.5 4 30
Sacra Argeorum 7.5625 16 121
Servius Sulpicius Rufus 7.75 36 279
Marcus Iunius Brutus [iur.] 7.75 4 31
Sextilius Ena 8 1 8
Marcus Duronius 8 6 48
Gaius Papirius Carbo 8 3 24
Dorcatius 8 2 16
Cornelius Epicadus 8 1 8
Lucius Verginius Rufus 8 2 16
Marcus Valerius Probus 8.077121 389 3142
Gaius Aelius Gallus 8.118644 59 479
Santra 8.125 8 65
Caelius Apicius 8.128217 2098 17053
Cloatius Verus 8.2 25 205
Sempronius Asellio 8.210526 38 312
Veranius 8.307692 26 216
Lucius Licinius Crassus 8.405405 74 622
Gaius Fannius 8.4375 16 135
Gaius Aurelius Cotta 8.5 2 17
Licinius Imbrex 8.5 2 17
P. Cornel. Scipio Afr. ma. 8.5 6 51
Volcacius Sedigitus 8.533333 15 128
Calpurnius Flaccus 8.637592 814 7031
Gannius 8.666667 3 26
Aufustius 8.666667 3 26
Gracchus, trag. 8.666667 3 26
Publilius Optatianus Porfyrius 8.75 4 35
Gavius Bassus 8.772727 22 193
Gaius Memmius L. f. 8.833333 6 53
Anonymi Grammatici 8.857143 14 124
Gnaeus Matius 8.866667 15 133
Ninnius Crassus 9 2 18
Pupius (?) 9 2 18
Gnaeus Tremelius Scrofa 9 2 18
Caecilius Metellus 9 1 9
Servius Clodius 9.25 4 37
Fenestella 9.266667 15 139
Pseudo-Varro 9.302198 182 1693
Publius Nigidius Figulus 9.318182 88 820
Lucius Annaeus Seneca senior 9.321086 10751 100211
Gaius Licinius Mucianus 9.333333 3 28
Carmen Arvale 9.555556 9 86
Marcus Porcius Cato; Cato 9.556937 2494 23835
Commentarii Augurum 9.6 5 48
Gaius Licinius Macer 9.625 8 77
Gaius Iulius Hyginus 9.677419 62 600
Laelius Felix 9.857143 14 138
Lucius Cornelius Sulla 9.857143 7 69
Precatio Terrae 9.869565 23 227
Aurelius Augustinus 10 3 30
Quintus Cornificius 10 2 20
Cn. Cornel. Lentulus Gaetulicus 10 2 20
Aulus Caecina 10 5 50
Gaius vel Lucius Caepasius 10 2 20
Phaedrus 10.05385 1170 11763
Q. Aurelius Memmius Symmachus 10.13333 15 152
Didascaliae et Argum. in Plautum 10.14189 148 1501
Publius Rutilius Lupus 10.15741 432 4388
Gaius Titius 10.33333 12 124
Didascaliae et Per. in Terentium 10.6 75 795
Gaius Asinius Pollio 10.67647 34 363
Lucius Calpurnius Piso Frugi 10.7619 21 226
Q. Pompeius Q.f.A.n. Rufus 11 1 11
Marianus 11 2 22
Bucolica Einsidlensia 11 56 616
Argum. Aen. et Tetrast. 11.048 125 1381
Fabius Pictor 11.08333 12 133
Pomponius Porphyrio 11.17033 6822 76204
Marcus Valerius Martialis 11.1994 5356 59984
Maurus Servius Honoratus; Servius 11.22091 34933 391980
Sentius Augurinus 11.25 4 45
Gaius Oppius 11.25 8 90
Gaius Laelius Sapiens 11.44444 9 103
Aquilius, comoed. 11.5 6 69
Publius Cannutius 11.5 2 23
Aulus Persius Flaccus 11.50253 396 4555
Iulius Valerius 11.53333 45 519
Balbus, grom. 11.58333 192 2224
Bellum Hispaniense [Anonymous] 11.58379 543 6290
Lucius Novius 11.66667 3 35
Gaius Cilnius Maecenas 11.7 10 117
Marcus Caelius Rufus 11.71429 14 164
Sueius 11.8 10 118
Porcius Licinus 11.91667 12 143
Fragmenta Bobiensia 11.9375 368 4393
Lucius Herennius Balbus 12.2 5 61
Alfius Avitus 12.25 4 49
Valerius Aedituus 12.33333 6 74
Albinus, poet. 12.33333 3 37
Gaius Sempronius Gracchus 12.44595 74 921
Egnatius 12.5 2 25
Petronius 12.51784 2522 31570
Q. Caecilius Metellus Maced. 12.57143 7 88
Hyginus, myth. 12.5927 2330 29341
Anonymi Epici et Lyrici 12.62245 196 2474
Sulpicia, Caleni uxor 12.65789 38 481
Iulius Montanus 12.66667 3 38
Marcus Tullius Tiro 12.73846 65 828
Gaius Lucilius 13.10167 659 8634
Quintus Serenus (Sammonicus) 13.46679 557 7501
Gaius Licinius Macer Calvus 13.61111 18 245
Chalcidius 13.63636 22 300
Marcus Fabius Quintilianus 13.6429 23867 325615
C. Iul. Caes. Augustus Octavianus 13.79024 410 5654
Sextus Propertius 13.89046 1835 25489
Ticidas 14 1 14
Vita Iuvenalis 14.09091 11 155
Albinovanus Pedo 14.1 10 141
Lucius Cincius 14.14706 34 481
Priapea 14.18952 248 3519
Albius Tibullus 14.23182 880 12524
M. Valerius Messalla Rufus 14.2381 21 299
Laevius 14.25 24 342
Caelius Aurelianus 14.33333 3 43
Commentarius Anquisit. Sergii 14.33333 9 129
Pompeius Trogus 14.39863 291 4190
Iustinianus; Justinian; Digest 14.46071 59719 863579
Lucius Annaeus Seneca iunior 14.50977 25329 367518
Quintus Ennius 14.55993 534 7775
Grattius 14.66529 242 3549
Publius Pomponius Secundus 14.75 4 59
Gaius Plinius Secundus; Pliny 14.8197 27105 401688
Publius Ovidius Naso 14.91512 15151 225979
Cornelius Severus 14.94118 17 254
Marcus Furius Bibaculus 15 13 195
Silius Italicus 15.05161 5096 76703
Terentianus Maurus 15.10484 1240 18730
Precatio Omnium Herbarum 15.125 8 121
Gaius Cornelius Gallus; Gallus 15.2 5 76
Gaius Sallustius Crispus 15.20252 3096 47067
Gaius Valerius Catullus 15.24535 860 13111
L. Aemilius L.f.M.n. Paulus 15.25 4 61
C. Plinius Caecilius Secundus; Pliny 15.43158 5642 87065
Quintus Horatius Flaccus; Horace 15.43709 2869 44289
P. Cornel. Scipio Aem. Afr. 15.46429 28 433
Quintus Asconius Pedianus 15.52377 1073 16657
Decimus Iunius Iuvenalis; Juvenal 15.54467 1623 25229
Titus Calpurnius Siculus 15.55193 337 5241
Cornelia, mater Gracchorum 15.5625 16 249
Gaius Valerius Flaccus 15.60507 2408 37577
Septimius Serenus 15.66667 12 188
Tabulae Censoriae 15.71429 7 110
Tarquitius Priscus 15.75 4 63
Quintus Curtius Rufus 15.7749 4678 73795
Domitius Marsus 15.77778 9 142
P. Terentius Varro Atacinus 15.83333 18 285
Annius Florus 15.88933 1753 27854
Parthenius Presbyter 16 6 96
Lentulus, mimus 16 1 16
Rabirius 16 2 32
Philumenus medicus 16.01157 432 6917
Sabidius 16.25 8 130
Claudius Caesar Germanicus 16.31593 383 6249
Scriptores Historiae Augustae 16.36783 6677 109288
Marcus Annaeus Lucanus 16.37906 3142 51463
Hyginus Gromaticus 16.49563 801 13213
Homerus Latinus 16.61634 404 6713
Marcus Terentius Varro; Varro 16.70142 5506 91958
Publius Vergilius Maro; Virgil; Vergil 16.72767 5027 84090
L. Aurel. Avianius Symmachus 16.83333 12 202
Aemilius Macer 16.85714 7 118
Publius Papinius Statius 16.86148 5667 95554
Siculus Flaccus 16.87603 363 6126
Marcus Cornelius Fronto 16.98284 2564 43544
Cornelius Nepos 17.14506 1689 28958
Gaius Helvius Cinna 17.28571 7 121
Lucius Volusius Maecianus 17.36434 129 2240
Cornelius Tacitus 17.73187 9186 162885
Laus Pisonis 17.75 96 1704
Appendix Vergiliana 17.86169 911 16272
Quintus Remmius Palaemon 18.04278 374 6748
Gaius Caesius Bassus 18.29721 323 5910
Zeno of Verona 18.30261 2224 40705
Marcus Tullius Cicero; Cicero; Tully 18.32248 64661 1184750
Hyginus Astronomus 18.40998 1222 22497
Imperator Nero 18.5 4 74
Aulus Cornelius Celsus 18.74741 5590 104798
Lucius Ampelius 18.80198 404 7596
Aemilius Sura 19 3 57
Sextus Iulius Frontinus 19.24388 1960 37718
Quintus Terentius Scaurus 19.37143 245 4746
Aufidius Bassus 19.6 5 98
Aulus Cremutius Cordus 19.66667 6 118
Aulus Gellius 19.67949 6318 124335
Pomponius Mela 19.83553 833 16523
Velius Longus 19.98201 389 7773
Q. Caecilius Metellus Numid. 20 8 160
Lucius Varius Rufus 20.14286 7 141
Helvius Mancia 20.16667 6 121
Gaius Valgius Rufus 20.2 5 101
L. Iunius Moderatus Columella 20.36453 5824 118603
Titus Livius; Livy 20.84459 25307 527514
Iulius Atherianus 21 3 63
Gaius Suetonius Tranquillus 21.03215 3795 79817
Apuleius Madaurensis 21.06414 5005 105426
Decimus Laberius 21.125 40 845
Valerius Maximus 21.18256 3807 80642
Tullius Laurea 21.33333 3 64
Mimi Poetarum Incertorum 21.4 5 107
Gaius, iur.; Gaius 21.51887 2438 52463
Scribonius Largus 21.54409 1270 27361
Bellum Alexandrinum [Anonymous] 21.5823 486 10489
Gaius Iulius Caesar; Caesar 21.6663 3641 78887
Bruttedius Niger 21.66667 6 130
Marcus Manilius; Manilius 22.10791 1251 27657
Vitruvius 22.24771 2620 58289
Aulus Hirtius 23.04124 291 6705
Paulus Quaestor 24 1 24
Bellum Africum [Anonymous] 24.5056 536 13135
Velleius Paterculus 25.20304 1054 26564
Titus Lucretius Carus 27.2115 1844 50178
Favorinus 27.25 4 109
Manilius, poet. 28 1 28
Carmen Evocationis 28.33333 3 85
Quintus Tullius Cicero 28.83439 157 4527
Carmen Devotionis 30.2 5 151
Manius Manilius 30.5 2 61
Decimus Iunius Silanus 32 4 128

Historians sorted by average words per sentence

In [5]:
authors = list(df.columns.values)

dict_gen = {}
historians = ['Titus Livius; Livy', 'Gaius Suetonius Tranquillus', 'Cornelius Tacitus', 'Valerius Maximus', 'Gaius Iulius Caesar; Caesar', 'Valerius Antias', 'Lucius Coelius Antipater', 'Sempronius Asellio', 'Gaius Asinius Pollio', 'Gavius Bassus', 'Lucius Calpurnius Piso Frugi', 'Marcus Porcius Cato; Cato', 'Lucius Cincius Alimentus', 'Claudius Caesar Germanicus', 'Quintus Claudius Quadrigarius', 'Lucius Herennius Balbus', 'Quintus Curtius Rufus', 'Annius Florus', 'Gnaeus Gellius', 'Granius Licinianus', 'Titus Labienus', 'Gaius Licinius Mucianus', 'Valerius Maximus', 'Quintus Asconius Pedianus', 'Fabius Pictor', 'Pompeius Trogus', 'Gaius Sallustius Crispus', 'Lucius Annaeus Seneca senior', 'Silius Italicus', 'Lucius Cornelius Sisenna', 'Velleius Paterculus']
for author in authors:
    if author in historians:
        dict_gen[author] = d[author]
df_hist = pd.DataFrame(dict_gen)
df_hist = df_hist.drop('tally_of_sent_word_lengths')
df_hist.T.sort('avg_words_per_sent', ascending=1)
Out[5]:
avg_words_per_sent sent_count word_count
Lucius Cincius Alimentus 2.6 5 13
Titus Labienus 3.142857 7 22
Gnaeus Gellius 4.222222 27 114
Lucius Cornelius Sisenna 4.485119 336 1507
Lucius Coelius Antipater 4.505495 91 410
Granius Licinianus 4.517206 988 4463
Valerius Antias 6.1 20 122
Quintus Claudius Quadrigarius 6.39 200 1278
Sempronius Asellio 8.210526 38 312
Gavius Bassus 8.772727 22 193
Lucius Annaeus Seneca senior 9.321086 10751 100211
Gaius Licinius Mucianus 9.333333 3 28
Marcus Porcius Cato; Cato 9.556937 2494 23835
Gaius Asinius Pollio 10.67647 34 363
Lucius Calpurnius Piso Frugi 10.7619 21 226
Fabius Pictor 11.08333 12 133
Lucius Herennius Balbus 12.2 5 61
Pompeius Trogus 14.39863 291 4190
Silius Italicus 15.05161 5096 76703
Gaius Sallustius Crispus 15.20252 3096 47067
Quintus Asconius Pedianus 15.52377 1073 16657
Quintus Curtius Rufus 15.7749 4678 73795
Annius Florus 15.88933 1753 27854
Claudius Caesar Germanicus 16.31593 383 6249
Cornelius Tacitus 17.73187 9186 162885
Titus Livius; Livy 20.84459 25307 527514
Gaius Suetonius Tranquillus 21.03215 3795 79817
Valerius Maximus 21.18256 3807 80642
Gaius Iulius Caesar; Caesar 21.6663 3641 78887
Velleius Paterculus 25.20304 1054 26564

Print to CSV

Saving the dataframe to comma–separated–values may be useful for use in other, non–Python programs (download here).

In [6]:
with open(abs_path) as f:
    r = f.read()
d = ast.literal_eval(r)

for x in d:
    y = d[x]
    tally = d[x]['tally_of_sent_word_lengths']
    z = dict(list(y.items()) + list(tally.items()))
    del z['tally_of_sent_word_lengths']
    d[x] = z

df = pd.DataFrame(d)

df.T.to_csv('phi5_auth_word_sentence_v3.csv')