Source code for token_stats

class token_stats:
    """
    functions to compute token statistics
    """

    # NOTE: in this file every method is a stub. Each body consists of a
    # docstring only, so calling it performs no work and returns None. The
    # Cython code that the docstrings describe is preserved below each
    # docstring as comments (re-indented; nesting inferred from the logic).
    #
    # Reference (Cython) attribute declarations and life-cycle hooks:
    #
    #     cdef TOKEN_stats* tks
    #     cdef object result_counts         # first cdef a new object and add in __cinit__ then use a def method() to call it
    #     cdef object result_collocations
    #     cdef object result_look_up_tbl
    #
    #     def __cinit__(self):
    #         self.tks = new TOKEN_stats()
    #         self.result_counts = {}
    #         self.result_collocations = {}
    #         self.result_look_up_tbl = {}
    #
    #     def __dealloc__(self):
    #         del self.tks

    def path_2vector(self, path_2folder=None, path_2file=None, file_delimiter="\n"):
        """
        Read the words of a folder or of a file into a list.

        :param path_2folder: either None or a valid path to a folder ( each file in the folder should include words separated by a delimiter )

        :param path_2file: either None or a valid path to a file

        :param file_delimiter: a single-character string specifying the file delimiter

        Example::

            tks = token_stats()

            res = tks.path_2vector(path_2file = '/myfolder/vocab_file.txt')

        .. note::

            The path_2vector method returns the words of a folder or file as a list ( the file_delimiter is used to split the input data ). Usage: read a vocabulary from a text file.
        """
        # Reference (Cython) implementation, disabled in this stub:
        #
        #     if path_2folder is not None:
        #         assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #         IF UNAME_SYSNAME == "Windows":
        #             assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #         ELSE:
        #             assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #     if path_2file is not None:
        #         assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #         assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #     assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        #     assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #     if path_2folder is None:
        #         path_2folder = ""
        #     if path_2file is None:
        #         path_2file = ""
        #     cdef vector[string] result_vec
        #     result_vec = self.tks.path_2vector(path_2folder, path_2file, file_delimiter)
        #     return result_vec

    def freq_distribution(self, x_vector=None, path_2folder=None, path_2file=None, file_delimiter="\n", keep=None):
        """
        Compute the frequency distribution of the words of a corpus.

        :param x_vector: either None or a string character list

        :param path_2folder: either None or a valid path to a folder ( each file in the folder should include words separated by a delimiter )

        :param path_2file: either None or a valid path to a file

        :param file_delimiter: a single-character string specifying the file delimiter

        :param keep: either None or the number of lines to keep from the output data frame

        Example::

            tks = token_stats()

            res = tks.freq_distribution(path_2file = '/myfolder/vocab_file.txt', keep = 20)

        .. note::

            This method returns a frequency distribution in form of a data frame for EITHER a folder, a file OR a character string list.
        """
        # Reference (Cython) implementation, disabled in this stub:
        #
        #     if x_vector is not None:
        #         assert isinstance(x_vector, list), 'the x_vector parameter should be of type list'
        #     if path_2folder is not None:
        #         assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #         IF UNAME_SYSNAME == "Windows":
        #             assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #         ELSE:
        #             assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #     if path_2file is not None:
        #         assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #         assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #     assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        #     assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #     if x_vector is None:
        #         x_vector = []
        #     if path_2folder is None:
        #         path_2folder = ""
        #     if path_2file is None:
        #         path_2file = ""
        #     cdef unordered_map[string, int] result_map
        #     result_map = self.tks.frequency_distribution(x_vector, path_2folder, path_2file, file_delimiter)
        #     result_pd = pd.DataFrame.from_dict(result_map, orient='index')
        #     result_pd.columns = ['freq']
        #     result_pd = result_pd.sort(['freq'], ascending=[False])
        #     if keep is not None:
        #         assert isinstance(keep, int), 'the keep parameter should be of type int'
        #         result_pd = result_pd[0:keep]
        #     return result_pd

    def count_character(self, x_vector=None, path_2folder=None, path_2file=None, file_delimiter="\n"):
        """
        Count the number of characters of each word of a corpus.

        :param x_vector: either None or a string character list

        :param path_2folder: either None or a valid path to a folder ( each file in the folder should include words separated by a delimiter )

        :param path_2file: either None or a valid path to a file

        :param file_delimiter: a single-character string specifying the file delimiter

        Example::

            tks = token_stats()

            res = tks.count_character(path_2file = '/myfolder/vocab_file.txt')

        .. note::

            The count_character method returns the number of characters for each word of the corpus for EITHER a folder, a file OR a character string list.
        """
        # Reference (Cython) implementation, disabled in this stub:
        #
        #     if x_vector is not None:
        #         assert isinstance(x_vector, list), 'the x_vector parameter should be of type list'
        #     if path_2folder is not None:
        #         assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #         IF UNAME_SYSNAME == "Windows":
        #             assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #         ELSE:
        #             assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #     if path_2file is not None:
        #         assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #         assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #     assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        #     assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #     if x_vector is None:
        #         x_vector = []
        #     if path_2folder is None:
        #         path_2folder = ""
        #     if path_2file is None:
        #         path_2file = ""
        #     cdef unordered_map[int, vector[string]] result_counts_tmp
        #     result_counts_tmp = self.tks.count_characters(x_vector, path_2folder, path_2file, file_delimiter)
        #     self.result_counts = result_counts_tmp
        #     return list(result_counts_tmp)        # extract keys() in both python 2 and 3

    def print_count_character(self, number=None):
        """
        Return the words that consist of a given number of characters.

        :param number: a numeric value. All words with number of characters (see method count_character) equal to the number parameter will be returned.

        Example::

            tks = token_stats()

            res = tks.count_character(path_2file = '/myfolder/vocab_file.txt')

            tks.print_count_character(number = 6)

        .. note::

            This method should be called after the 'count_character' method is run. Given the numeric parameter 'number' this method prints all the words with number of characters equal to 'number'.
        """
        # Reference (Cython) implementation, disabled in this stub:
        #
        #     assert isinstance(number, int), 'the number parameter should be of type int'
        #     assert number in list(self.result_counts), "the specified 'number' is not included in the count_character dictionary. Return the 'count_character()' method to see the list of the available numbers"        # extract keys() in both python 2 and 3 using list()
        #     return self.result_counts[number]

    def collocation_words(self, x_vector=None, path_2folder=None, path_2file=None, file_delimiter="\n", n_gram_delimiter="_"):
        """
        Build a co-occurrence frequency table for the n-grams of a corpus.

        :param x_vector: either None or a string character list

        :param path_2folder: either None or a valid path to a folder ( each file in the folder should include words separated by a delimiter )

        :param path_2file: either None or a valid path to a file

        :param file_delimiter: a single-character string specifying the file delimiter

        :param n_gram_delimiter: a character string specifying the n-gram delimiter

        Example::

            tks = token_stats()

            res = tks.collocation_words(path_2file = '/myfolder/vocab_file.txt')

        .. note::

            The collocation_words method saves a co-occurrence frequency table for n-grams for EITHER a folder, a file OR a character string list. A collocation is defined as a sequence of two or more consecutive words, that has characteristics of a syntactic and semantic unit, and whose exact and unambiguous meaning or connotation cannot be derived directly from the meaning or connotation of its components ( page 172 of the original reference ). The input to the method should be text n-grams separated by a delimiter (for instance 3- or 4-ngrams ).
        """
        # NOTE(review): the citation that preceded "page 172" in the note above
        # is missing from this copy of the docstring -- restore it from the
        # upstream documentation.
        #
        # Reference (Cython) implementation, disabled in this stub:
        #
        #     if x_vector is not None:
        #         assert isinstance(x_vector, list), 'the x_vector parameter should be of type list'
        #     if path_2folder is not None:
        #         assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #         IF UNAME_SYSNAME == "Windows":
        #             assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #         ELSE:
        #             assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #     if path_2file is not None:
        #         assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #         assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #     assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        #     assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #     assert isinstance(n_gram_delimiter, basestring), 'the n_gram_delimiter parameter should be of type string'
        #     if x_vector is None:
        #         x_vector = []
        #     if path_2folder is None:
        #         path_2folder = ""
        #     if path_2file is None:
        #         path_2file = ""
        #     cdef unordered_map[string, unordered_map[string, int]] result_coll
        #     result_coll = self.tks.collocations_ngrams(x_vector, path_2folder, path_2file, file_delimiter, n_gram_delimiter)
        #     self.result_collocations = result_coll
        #     return np.sort(list(result_coll))        # extract keys() in both python 2 and 3

    def print_collocations(self, word=None):
        """
        Return the collocations of a specific word.

        :param word: a character string. The collocations (see method collocation_words) of this word will be returned.

        Example::

            tks = token_stats()

            res = tks.collocation_words(path_2file = '/myfolder/vocab_file.txt')

            tks.print_collocations(word = 'aword')

        .. note::

            This method should be called after the 'collocation_words' method is run. It prints the collocations for a specific 'word'.
        """
        # Reference (Cython) implementation, disabled in this stub:
        #
        #     assert isinstance(word, basestring), 'the word parameter should be of type string'
        #     assert word in list(self.result_collocations), "the specified 'word' is not included in the collocations dictionary. Return the 'collocation_words()' method to see the list of the available words"        # extract keys() in both python 2 and 3 using list()
        #     tmp_vals = self.result_collocations[word]
        #     tmp_sum = np.sum(listvalues(tmp_vals))
        #     for (k,v) in iteritems(tmp_vals):
        #         tmp_vals[k] = float(np.round(v / float(tmp_sum), decimals = 3))        # first round then use float to get the correct rounding
        #     return tmp_vals

    def string_dissimilarity_matrix(self, words_vector=None, dice_n_gram=2, method='dice', split_separator=" ",
                                    dice_thresh=1.0, upper=True, diagonal=True, threads=1):
        """
        Compute a string dissimilarity matrix for a list of words.

        :param words_vector: a string character list

        :param dice_n_gram: a numeric value specifying the n-gram for the dice method of the string_dissimilarity_matrix method

        :param method: a character string specifying the method to use in the string_dissimilarity_matrix method. One of dice, levenshtein or cosine

        :param split_separator: a character string specifying the string split separator if method equal cosine in the string_dissimilarity_matrix method. The cosine method uses sentences, so for a sentence : "this_is_a_word_sentence" the split_separator should be "_"

        :param dice_thresh: a float number to use to threshold the data if method is dice in the string_dissimilarity_matrix method. It takes values between 0.0 and 1.0. The closer the thresh is to 0.0 the more values of the dissimilarity matrix will take the value of 1.0.

        :param upper: either True or False. If True then both lower and upper parts of the dissimilarity matrix of the string_dissimilarity_matrix method will be shown. Otherwise the upper part will be filled with NA's

        :param diagonal: either True or False. If True then the diagonal of the dissimilarity matrix of the string_dissimilarity_matrix method will be shown. Otherwise the diagonal will be filled with NA's

        :param threads: a numeric value specifying the number of cores to use in parallel in the string_dissimilarity_matrix method

        Example::

            tks = token_stats()

            vocab_lst = ['the', 'term', 'planet', 'is', 'ancient', 'with', 'ties', 'to']

            res = tks.string_dissimilarity_matrix(words_vector = vocab_lst, dice_n_gram = 2, method = 'dice')

        .. note::

            The string_dissimilarity_matrix method returns a string-dissimilarity-matrix using either the dice, levenshtein or cosine distance. The input can be a character string list only.

            In case that the method is dice then the dice-coefficient (similarity) is calculated between two strings for a specific number of character n-grams ( dice_n_gram ).
        """
        # Reference (Cython) implementation, disabled in this stub:
        #
        #     assert isinstance(words_vector, list), 'the words_vector parameter should be of type list'
        #     assert isinstance(dice_n_gram, int) and dice_n_gram > 0, 'the dice_n_gram parameter should be of type int and greater than 0'
        #     assert method in ["dice", "levenshtein", "cosine"], "available methods are 'dice', 'levenshtein' or 'cosine'"
        #     assert isinstance(split_separator, basestring), 'the split_separator parameter should be of type string'
        #     assert isinstance(dice_thresh, float) and (dice_thresh <= 1.0 and dice_thresh > 0.0), 'the dice_thresh parameter should be of type float'
        #     assert isinstance(upper, bool), 'the upper parameter should be of type boolean'
        #     assert isinstance(diagonal, bool), 'the diagonal parameter should be of type boolean'
        #     assert isinstance(threads, int) and threads > 0, 'the threads parameter should be of type int and greater than 0'
        #     cdef vector[vector[double]] dissim_mat
        #     sorted_vec = list(np.sort(words_vector))
        #     dissim_mat = self.tks.dissimilarity_mat(sorted_vec, dice_n_gram, method, split_separator, dice_thresh, upper, diagonal, threads)
        #     df = pd.DataFrame(dissim_mat, index = sorted_vec, columns = sorted_vec)
        #     return df

    def look_up_table(self, words_vector=None, n_grams=None):
        """
        Build a look-up table that maps n-grams to the words containing them.

        :param words_vector: a string character list

        :param n_grams: a numeric value specifying the n-grams

        Example::

            tks = token_stats()

            vocab_lst = ['the', 'term', 'planet', 'is', 'ancient', 'with', 'ties', 'to']

            res = tks.look_up_table(words_vector = vocab_lst, n_grams = 4)

        .. note::

            The look_up_table returns a look-up-list where the list-names are the n-grams and the list-vectors are the words associated with those n-grams. The input can be a character string list only.
        """
        # Reference (Cython) implementation, disabled in this stub:
        #
        #     assert isinstance(words_vector, list), 'the words_vector parameter should be of type list'
        #     assert isinstance(n_grams, int) and n_grams > 0, 'the n_grams parameter should be of type int and greater than 0'
        #     cdef unordered_map[string, vector[string]] look_up_tmp
        #     look_up_tmp = self.tks.look_up_tbl(words_vector, n_grams)
        #     self.result_look_up_tbl = look_up_tmp
        #     return np.sort(list(look_up_tmp))        # extract keys() in both python 2 and 3

    def print_words_lookup_tbl(self, n_gram=None):
        """
        Return the words associated with an n-gram of the look-up table.

        :param n_gram: a character string specifying the n-gram

        Example::

            tks = token_stats()

            vocab_lst = ['the', 'term', 'planet', 'is', 'ancient', 'with', 'ties', 'to']

            res = tks.look_up_table(words_vector = vocab_lst, n_grams = 4)

            tks.print_words_lookup_tbl(n_gram = "_abo")

        .. note::

            This method should be called after the 'look_up_table' method is run. It returns words associated to n-grams in the look-up-table.
        """
        # Reference (Cython) implementation, disabled in this stub:
        #
        #     assert isinstance(n_gram, basestring), 'the n_gram parameter should be of type string'
        #     assert n_gram in list(self.result_look_up_tbl), "the specified 'n_gram' is not included in the look_up_table dictionary. Return the 'look_up_table()' method to see the list of the available n_grams"        # extract keys() in both python 2 and 3
        #     return self.result_look_up_tbl[n_gram]
if __name__ == '__main__':
    # Smoke run: build one token_stats object and invoke every public method
    # once, in declaration order, using nothing but the default arguments.
    # The return values are discarded.
    tks = token_stats()
    for method_name in (
        'path_2vector',
        'freq_distribution',
        'count_character',
        'print_count_character',
        'collocation_words',
        'print_collocations',
        'string_dissimilarity_matrix',
        'look_up_table',
        'print_words_lookup_tbl',
    ):
        getattr(tks, method_name)()