# Source code for token_stats



class token_stats:
    """
    Functions to compute token statistics.

    .. note::

        Every method of this class is a stub: its body consists of the
        docstring only, so a call performs no work and returns ``None``.
        The Cython reference implementation is preserved as commented-out
        code underneath each docstring (presumably so that the documentation
        can be built without the compiled extension -- confirm against the
        build setup).
    """

    # --- reference implementation (Cython), kept for documentation ---------
    # cdef TOKEN_stats* tks
    #
    # cdef object result_counts         # first cdef a new object and add in __cinit__ then use a def method() to call it
    # cdef object result_collocations
    # cdef object result_look_up_tbl
    #
    # def __cinit__(self):
    #     self.tks = new TOKEN_stats()
    #     self.result_counts = {}
    #     self.result_collocations = {}
    #     self.result_look_up_tbl = {}
    #
    # def __dealloc__(self):
    #     del self.tks

    def path_2vector(self, path_2folder=None, path_2file=None, file_delimiter="\n"):
        """
        Read the words of a folder or of a file into a list.

        :param path_2folder: either None or a valid path to a folder ending in
            a slash ( each file in the folder should include words separated
            by a delimiter )
        :param path_2file: either None or a valid path to a file
        :param file_delimiter: a single-character string specifying the file
            delimiter
        :returns: ``None`` in this stub; the reference implementation returns
            the vector of words that were read

        Example::

                tks = token_stats()

                res = tks.path_2vector(path_2file = '/myfolder/vocab_file.txt')

        .. note::

            The path_2vector method returns the words of a folder or file as a
            list ( using the file_delimiter to input the data ).
            Usage: read a vocabulary from a text file.
        """
        # --- reference implementation (Cython) ---
        # if path_2folder is not None:
        #     assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #     IF UNAME_SYSNAME == "Windows":
        #         assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #     ELSE:
        #         assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #
        # if path_2file is not None:
        #     assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #     assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #
        # assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        # assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #
        # if path_2folder is None:
        #     path_2folder = ""
        # if path_2file is None:
        #     path_2file = ""
        #
        # cdef vector[string] result_vec
        # result_vec = self.tks.path_2vector(path_2folder, path_2file, file_delimiter)
        # return result_vec

    def freq_distribution(self, x_vector=None, path_2folder=None, path_2file=None, file_delimiter="\n", keep=None):
        """
        Compute the frequency distribution of the words of a corpus.

        :param x_vector: either None or a string character list
        :param path_2folder: either None or a valid path to a folder ending in
            a slash ( each file in the folder should include words separated
            by a delimiter )
        :param path_2file: either None or a valid path to a file
        :param file_delimiter: a single-character string specifying the file
            delimiter
        :param keep: either None or the number (int) of lines to keep from the
            output data frame
        :returns: ``None`` in this stub; the reference implementation returns
            a pandas data frame with a single ``freq`` column, sorted by
            decreasing frequency and truncated to ``keep`` rows if given

        Example::

                tks = token_stats()

                res = tks.freq_distribution(path_2file = '/myfolder/vocab_file.txt', keep = 20)

        .. note::

            This method returns a frequency distribution in form of a data
            frame for EITHER a folder, a file OR a character string list.
        """
        # --- reference implementation (Cython) ---
        # if x_vector is not None:
        #     assert isinstance(x_vector, list), 'the x_vector parameter should be of type list'
        #
        # if path_2folder is not None:
        #     assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #     IF UNAME_SYSNAME == "Windows":
        #         assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #     ELSE:
        #         assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #
        # if path_2file is not None:
        #     assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #     assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #
        # assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        # assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #
        # if x_vector is None:
        #     x_vector = []
        # if path_2folder is None:
        #     path_2folder = ""
        # if path_2file is None:
        #     path_2file = ""
        #
        # cdef unordered_map[string, int] result_map
        # result_map = self.tks.frequency_distribution(x_vector, path_2folder, path_2file, file_delimiter)
        # result_pd = pd.DataFrame.from_dict(result_map, orient='index')
        # result_pd.columns = ['freq']
        # result_pd = result_pd.sort(['freq'], ascending=[False])
        #
        # if keep is not None:
        #     assert isinstance(keep, int), 'the keep parameter should be of type int'
        #     result_pd = result_pd[0:keep]
        #
        # return result_pd

    def count_character(self, x_vector=None, path_2folder=None, path_2file=None, file_delimiter="\n"):
        """
        Group the words of a corpus by their number of characters.

        :param x_vector: either None or a string character list
        :param path_2folder: either None or a valid path to a folder ending in
            a slash ( each file in the folder should include words separated
            by a delimiter )
        :param path_2file: either None or a valid path to a file
        :param file_delimiter: a single-character string specifying the file
            delimiter
        :returns: ``None`` in this stub; the reference implementation stores
            the mapping ( number of characters -> words ) on the instance and
            returns the list of its keys

        Example::

                tks = token_stats()

                res = tks.count_character(path_2file = '/myfolder/vocab_file.txt')

        .. note::

            The count_character method returns the number of characters for
            each word of the corpus for EITHER a folder, a file OR a character
            string list.
        """
        # --- reference implementation (Cython) ---
        # if x_vector is not None:
        #     assert isinstance(x_vector, list), 'the x_vector parameter should be of type list'
        #
        # if path_2folder is not None:
        #     assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #     IF UNAME_SYSNAME == "Windows":
        #         assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #     ELSE:
        #         assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #
        # if path_2file is not None:
        #     assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #     assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #
        # assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        # assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #
        # if x_vector is None:
        #     x_vector = []
        # if path_2folder is None:
        #     path_2folder = ""
        # if path_2file is None:
        #     path_2file = ""
        #
        # cdef unordered_map[int, vector[string]] result_counts_tmp
        # result_counts_tmp = self.tks.count_characters(x_vector, path_2folder, path_2file, file_delimiter)
        # self.result_counts = result_counts_tmp
        # return list(result_counts_tmp)                         # extract keys() in both python 2 and 3

    def print_count_character(self, number=None):
        """
        Look up the words that have a given number of characters.

        :param number: a numeric value (int). All words with number of
            characters (see method count_character) equal to the number
            parameter will be returned.
        :returns: ``None`` in this stub; the reference implementation returns
            the words stored under ``number`` by ``count_character``

        Example::

                tks = token_stats()

                res = tks.count_character(path_2file = '/myfolder/vocab_file.txt')

                tks.print_count_character(number = 6)

        .. note::

            This method should be called after the 'count_character' method is
            run. Given the numeric parameter 'number' this method prints all
            the words with number of characters equal to 'number'.
        """
        # --- reference implementation (Cython) ---
        # assert isinstance(number, int), 'the number parameter should be of type int'
        # assert number in list(self.result_counts), "the specified 'number' is not included in the count_character dictionary. Return the 'count_character()' method to see the list of the available numbers"                   # extract keys() in both python 2 and 3 using list()
        # return self.result_counts[number]

    def collocation_words(self, x_vector=None, path_2folder=None, path_2file=None, file_delimiter="\n", n_gram_delimiter="_"):
        """
        Build a co-occurrence frequency table for n-grams.

        :param x_vector: either None or a string character list
        :param path_2folder: either None or a valid path to a folder ending in
            a slash ( each file in the folder should include words separated
            by a delimiter )
        :param path_2file: either None or a valid path to a file
        :param file_delimiter: a single-character string specifying the file
            delimiter
        :param n_gram_delimiter: a character string specifying the n-gram
            delimiter
        :returns: ``None`` in this stub; the reference implementation stores
            the co-occurrence table on the instance and returns its sorted keys

        Example::

                tks = token_stats()

                res = tks.collocation_words(path_2file = '/myfolder/vocab_file.txt')

        .. note::

            The collocation_words method saves a co-occurrence frequency table
            for n-grams for EITHER a folder, a file OR a character string list.

            A collocation is defined as a sequence of two or more consecutive
            words, that has characteristics of a syntactic and semantic unit,
            and whose exact and unambiguous meaning or connotation cannot be
            derived directly from the meaning or connotation of its components
            ( http://nlp.stanford.edu/fsnlp/promo/colloc.pdf, page 172 ).

            The input to the method should be text n-grams separated by a
            delimiter ( for instance 3- or 4-ngrams ).
        """
        # --- reference implementation (Cython) ---
        # if x_vector is not None:
        #     assert isinstance(x_vector, list), 'the x_vector parameter should be of type list'
        #
        # if path_2folder is not None:
        #     assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #     IF UNAME_SYSNAME == "Windows":
        #         assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #     ELSE:
        #         assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #
        # if path_2file is not None:
        #     assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #     assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #
        # assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        # assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        # assert isinstance(n_gram_delimiter, basestring), 'the n_gram_delimiter parameter should be of type string'
        #
        # if x_vector is None:
        #     x_vector = []
        # if path_2folder is None:
        #     path_2folder = ""
        # if path_2file is None:
        #     path_2file = ""
        #
        # cdef unordered_map[string, unordered_map[string, int]] result_coll
        # result_coll = self.tks.collocations_ngrams(x_vector, path_2folder, path_2file, file_delimiter, n_gram_delimiter)
        # self.result_collocations = result_coll
        # return np.sort(list(result_coll))                       # extract keys() in both python 2 and 3

    def print_collocations(self, word=None):
        """
        Look up the collocations of a specific word.

        :param word: a character string. The collocations stored for this word
            (see method collocation_words) will be returned.
        :returns: ``None`` in this stub; the reference implementation returns
            the collocations of ``word`` with their counts converted to
            relative frequencies rounded to 3 decimals

        Example::

                tks = token_stats()

                res = tks.collocation_words(path_2file = '/myfolder/vocab_file.txt')

                tks.print_collocations(word = 'aword')

        .. note::

            This method should be called after the 'collocation_words' method
            is run. It prints the collocations for a specific 'word'.
        """
        # --- reference implementation (Cython) ---
        # assert isinstance(word, basestring), 'the word parameter should be of type string'
        # assert word in list(self.result_collocations), "the specified 'word' is not included in the collocations dictionary. Return the 'collocation_words()' method to see the list of the available words"           # extract keys() in both python 2 and 3 using list()
        #
        # tmp_vals = self.result_collocations[word]
        # tmp_sum = np.sum(listvalues(tmp_vals))
        #
        # for (k,v) in iteritems(tmp_vals):
        #     tmp_vals[k] = float(np.round(v / float(tmp_sum), decimals = 3))                        # first round then use float to get the correct rounding
        #
        # return tmp_vals

    def string_dissimilarity_matrix(self, words_vector=None, dice_n_gram=2, method='dice', split_separator=" ", dice_thresh=1.0, upper=True, diagonal=True, threads=1):
        """
        Compute a string dissimilarity matrix for a list of words.

        :param words_vector: a string character list
        :param dice_n_gram: a numeric value (int greater than 0) specifying
            the n-gram for the dice method
        :param method: a character string specifying the method to use. One of
            dice, levenshtein or cosine
        :param split_separator: a character string specifying the string split
            separator if method equal cosine. The cosine method uses
            sentences, so for a sentence : "this_is_a_word_sentence" the
            split_separator should be "_"
        :param dice_thresh: a float number to use to threshold the data if
            method is dice. It takes values between 0.0 and 1.0. The closer
            the thresh is to 0.0 the more values of the dissimilarity matrix
            will take the value of 1.0.
        :param upper: either True or False. If True then both lower and upper
            parts of the dissimilarity matrix will be shown. Otherwise the
            upper part will be filled with NA's
        :param diagonal: either True or False. If True then the diagonal of
            the dissimilarity matrix will be shown. Otherwise the diagonal
            will be filled with NA's
        :param threads: a numeric value (int greater than 0) specifying the
            number of cores to use in parallel
        :returns: ``None`` in this stub; the reference implementation returns
            a pandas data frame whose rows and columns are labelled with the
            sorted words

        Example::

                tks = token_stats()

                vocab_lst = ['the', 'term', 'planet', 'is', 'ancient', 'with', 'ties', 'to']

                res = tks.string_dissimilarity_matrix( words_vector = vocab_lst, dice_n_gram = 2, method = 'dice')

        .. note::

            The string_dissimilarity_matrix method returns a
            string-dissimilarity-matrix using either the dice, levenshtein or
            cosine distance. The input can be a character string list only.
            In case that the method is dice then the dice-coefficient
            (similarity) is calculated between two strings for a specific
            number of character n-grams ( dice_n_gram ).
        """
        # --- reference implementation (Cython) ---
        # assert isinstance(words_vector, list), 'the words_vector parameter should be of type list'
        # assert isinstance(dice_n_gram, int) and dice_n_gram > 0, 'the dice_n_gram parameter should be of type int and greater than 0'
        # assert method in ["dice", "levenshtein", "cosine"], "available methods are 'dice', 'levenshtein' or 'cosine'"
        # assert isinstance(split_separator, basestring), 'the split_separator parameter should be of type string'
        # assert isinstance(dice_thresh, float) and (dice_thresh <= 1.0 and dice_thresh > 0.0), 'the dice_thresh parameter should be of type float'
        # assert isinstance(upper, bool), 'the upper parameter should be of type boolean'
        # assert isinstance(diagonal, bool), 'the diagonal parameter should be of type boolean'
        # assert isinstance(threads, int) and threads > 0, 'the threads parameter should be of type int and greater than 0'
        #
        # cdef vector[vector[double]] dissim_mat
        # sorted_vec = list(np.sort(words_vector))
        # dissim_mat = self.tks.dissimilarity_mat(sorted_vec, dice_n_gram, method, split_separator, dice_thresh, upper, diagonal, threads)
        # df = pd.DataFrame(dissim_mat, index = sorted_vec, columns = sorted_vec)
        # return df

    def look_up_table(self, words_vector=None, n_grams=None):
        """
        Build a look-up table that maps n-grams to the words containing them.

        :param words_vector: a string character list
        :param n_grams: a numeric value (int greater than 0) specifying the
            n-grams
        :returns: ``None`` in this stub; the reference implementation stores
            the look-up table on the instance and returns its sorted n-gram
            keys

        Example::

                tks = token_stats()

                vocab_lst = ['the', 'term', 'planet', 'is', 'ancient', 'with', 'ties', 'to']

                res = tks.look_up_table(words_vector = vocab_lst, n_grams = 4)

        .. note::

            The look_up_table returns a look-up-list where the list-names are
            the n-grams and the list-vectors are the words associated with
            those n-grams. The input can be a character string list only.
        """
        # --- reference implementation (Cython) ---
        # assert isinstance(words_vector, list), 'the words_vector parameter should be of type list'
        # assert isinstance(n_grams, int) and n_grams > 0, 'the n_grams parameter should be of type int and greater than 0'
        #
        # cdef unordered_map[string, vector[string]] look_up_tmp
        # look_up_tmp = self.tks.look_up_tbl(words_vector, n_grams)
        # self.result_look_up_tbl = look_up_tmp
        # return np.sort(list(look_up_tmp))                    # extract keys() in both python 2 and 3

    def print_words_lookup_tbl(self, n_gram=None):
        """
        Look up the words associated with an n-gram of the look-up table.

        :param n_gram: a character string specifying the n-gram
        :returns: ``None`` in this stub; the reference implementation returns
            the words stored under ``n_gram`` by ``look_up_table``

        Example::

                tks = token_stats()

                vocab_lst = ['the', 'term', 'planet', 'is', 'ancient', 'with', 'ties', 'to']

                res = tks.look_up_table(words_vector = vocab_lst, n_grams = 4)

                tks.print_words_lookup_tbl(n_gram = "_abo")

        .. note::

            This method should be called after the 'look_up_table' method is
            run. It returns words associated to n-grams in the look-up-table.
        """
        # --- reference implementation (Cython) ---
        # assert isinstance(n_gram, basestring), 'the n_gram parameter should be of type string'
        # assert n_gram in list(self.result_look_up_tbl), "the specified 'n_gram' is not included in the look_up_table dictionary. Return the 'look_up_table()' method to see the list of the available n_grams"          # extract keys() in both python 2 and 3
        # return self.result_look_up_tbl[n_gram]
        
        
if __name__ == '__main__':
    # Smoke-run: call every public method once, in declaration order, relying
    # on the default value of each parameter.
    tks = token_stats()
    for method_name in (
        'path_2vector',
        'freq_distribution',
        'count_character',
        'print_count_character',
        'collocation_words',
        'print_collocations',
        'string_dissimilarity_matrix',
        'look_up_table',
        'print_words_lookup_tbl',
    ):
        getattr(tks, method_name)()