Source code for token_stats



class token_stats:
    """
    functions to compute token statistics

    In this listing every method is a documentation stub: it carries the
    docstring of the public API and returns ``None``. The commented-out
    blocks record the Cython reference implementation that the stubs mirror.

    NOTE(review): the nesting of the commented-out reference code was
    reconstructed from a flattened listing -- confirm against the original
    ``.pyx`` source before relying on it.
    """

    # cdef TOKEN_stats* tks
    #
    # cdef object result_counts          # first cdef a new object and add in __cinit__ then use a def method() to call it
    #
    # cdef object result_collocations
    #
    # cdef object result_look_up_tbl
    #
    #
    # def __cinit__(self):
    #
    #     self.tks = new TOKEN_stats()
    #
    #     self.result_counts = {}
    #
    #     self.result_collocations = {}
    #
    #     self.result_look_up_tbl = {}
    #
    #
    # def __dealloc__(self):
    #
    #     del self.tks

    def path_2vector(self, path_2folder = None, path_2file = None, file_delimiter = "\n"):
        """
        :param path_2folder: either None or a valid path to a folder ( each file in the folder should include words separated by a delimiter )

        :param path_2file: either None or a valid path to a file

        :param file_delimiter: either None or a character string specifying the file delimiter

        Example::

            tks = token_stats()

            res = tks.path_2vector(path_2file = '/myfolder/vocab_file.txt')

        .. note::

            the path_2vector method returns the words of a folder or file to a list ( using the file_delimiter to input the data ).
            Usage: read a vocabulary from a text file
        """

        # if path_2folder is not None:
        #
        #     assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #
        #     IF UNAME_SYSNAME == "Windows":
        #
        #         assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #
        #     ELSE:
        #
        #         assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #
        # if path_2file is not None:
        #
        #     assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #
        #     assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #
        # assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        #
        # assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #
        # if path_2folder is None:
        #
        #     path_2folder = ""
        #
        # if path_2file is None:
        #
        #     path_2file = ""
        #
        # cdef vector[string] result_vec
        #
        # result_vec = self.tks.path_2vector(path_2folder, path_2file, file_delimiter)
        #
        # return result_vec

        pass

    def freq_distribution(self, x_vector = None, path_2folder = None, path_2file = None, file_delimiter = "\n", keep = None):
        """
        :param x_vector: either None or a string character list

        :param path_2folder: either None or a valid path to a folder ( each file in the folder should include words separated by a delimiter )

        :param path_2file: either None or a valid path to a file

        :param file_delimiter: either None or a character string specifying the file delimiter

        :param keep: the number of lines to keep from the output data frame

        Example::

            tks = token_stats()

            res = tks.freq_distribution(path_2file = '/myfolder/vocab_file.txt', keep = 20)

        .. note::

            This method returns a frequency_distribution in form of a data frame for EITHER a folder, a file OR a character string list.
        """

        # if x_vector is not None:
        #
        #     assert isinstance(x_vector, list), 'the x_vector parameter should be of type list'
        #
        # if path_2folder is not None:
        #
        #     assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #
        #     IF UNAME_SYSNAME == "Windows":
        #
        #         assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #
        #     ELSE:
        #
        #         assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #
        # if path_2file is not None:
        #
        #     assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #
        #     assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #
        # assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        #
        # assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #
        # if x_vector is None:
        #
        #     x_vector = []
        #
        # if path_2folder is None:
        #
        #     path_2folder = ""
        #
        # if path_2file is None:
        #
        #     path_2file = ""
        #
        # cdef unordered_map[string, int] result_map
        #
        # result_map = self.tks.frequency_distribution(x_vector, path_2folder, path_2file, file_delimiter)
        #
        # result_pd = pd.DataFrame.from_dict(result_map, orient='index')
        #
        # result_pd.columns = ['freq']
        #
        # result_pd = result_pd.sort(['freq'], ascending=[False])
        #
        # if keep is not None:
        #
        #     assert isinstance(keep, int), 'the keep parameter should be of type int'
        #
        #     result_pd = result_pd[0:keep]
        #
        # return result_pd

        pass

    def count_character(self, x_vector = None, path_2folder = None, path_2file = None, file_delimiter = "\n"):
        """
        :param x_vector: either None or a string character list

        :param path_2folder: either None or a valid path to a folder ( each file in the folder should include words separated by a delimiter )

        :param path_2file: either None or a valid path to a file

        :param file_delimiter: either None or a character string specifying the file delimiter

        Example::

            tks = token_stats()

            res = tks.count_character(path_2file = '/myfolder/vocab_file.txt')

        .. note::

            The count_character method returns the number of characters for each word of the corpus for EITHER a folder, a file OR a character string list.
        """

        # if x_vector is not None:
        #
        #     assert isinstance(x_vector, list), 'the x_vector parameter should be of type list'
        #
        # if path_2folder is not None:
        #
        #     assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #
        #     IF UNAME_SYSNAME == "Windows":
        #
        #         assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #
        #     ELSE:
        #
        #         assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #
        # if path_2file is not None:
        #
        #     assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #
        #     assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #
        # assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        #
        # assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #
        # if x_vector is None:
        #
        #     x_vector = []
        #
        # if path_2folder is None:
        #
        #     path_2folder = ""
        #
        # if path_2file is None:
        #
        #     path_2file = ""
        #
        # cdef unordered_map[int, vector[string]] result_counts_tmp
        #
        # result_counts_tmp = self.tks.count_characters(x_vector, path_2folder, path_2file, file_delimiter)
        #
        # self.result_counts = result_counts_tmp
        #
        # return list(result_counts_tmp)          # extract keys() in both python 2 and 3

        pass

    def print_count_character(self, number = None):
        """
        :param number: a numeric value. All words with number of characters (see method count_character) equal to the number parameter will be returned.

        Example::

            tks = token_stats()

            res = tks.count_character(path_2file = '/myfolder/vocab_file.txt')

            tks.print_count_character(number = 6)

        .. note::

            This method should be called after the 'count_character' method is run. Given the numeric parameter 'number' this method prints all the words with number of characters equal to 'number'
        """

        # assert isinstance(number, int), 'the number parameter should be of type int'
        #
        # assert number in list(self.result_counts), "the specified 'number' is not included in the count_character dictionary. Return the 'count_character()' method to see the list of the available numbers"          # extract keys() in both python 2 and 3 using list()
        #
        # return self.result_counts[number]

        pass

    def collocation_words(self, x_vector = None, path_2folder = None, path_2file = None, file_delimiter = "\n", n_gram_delimiter = "_"):
        """
        :param x_vector: either None or a string character list

        :param path_2folder: either None or a valid path to a folder ( each file in the folder should include words separated by a delimiter )

        :param path_2file: either None or a valid path to a file

        :param file_delimiter: either None or a character string specifying the file delimiter

        :param n_gram_delimiter: either None or a character string specifying the n-gram delimiter.

        Example::

            tks = token_stats()

            res = tks.collocation_words(path_2file = '/myfolder/vocab_file.txt')

        .. note::

            The collocation_words method saves a co-occurrence frequency table for n-grams for EITHER a folder, a file OR a character string list.
            A collocation is defined as a sequence of two or more consecutive words, that has characteristics of a syntactic and semantic unit,
            and whose exact and unambiguous meaning or connotation cannot be derived directly from the meaning or connotation of its components
            ( http://nlp.stanford.edu/fsnlp/promo/colloc.pdf, page 172 ). The input to the method should be text n-grams separated by a delimiter
            (for instance 3- or 4-ngrams ).
        """

        # if x_vector is not None:
        #
        #     assert isinstance(x_vector, list), 'the x_vector parameter should be of type list'
        #
        # if path_2folder is not None:
        #
        #     assert isinstance(path_2folder, basestring), 'the path_2folder parameter should be of type string'
        #
        #     IF UNAME_SYSNAME == "Windows":
        #
        #         assert path_2folder.split('\\')[-1] == "", "the path_2folder parameter should end in slash"
        #
        #     ELSE:
        #
        #         assert path_2folder.split('/')[-1] == "", "the path_2folder parameter should end in slash"
        #
        # if path_2file is not None:
        #
        #     assert isinstance(path_2file, basestring), 'the path_2file parameter should be of type string'
        #
        #     assert os.path.exists(path_2file), "the path_2file parameter should be a valid path to a file"
        #
        # assert isinstance(file_delimiter, basestring), 'the file_delimiter parameter should be of type string'
        #
        # assert len(file_delimiter) == 1, 'the file_delimiter should be a single character string'
        #
        # assert isinstance(n_gram_delimiter, basestring), 'the n_gram_delimiter parameter should be of type string'
        #
        # if x_vector is None:
        #
        #     x_vector = []
        #
        # if path_2folder is None:
        #
        #     path_2folder = ""
        #
        # if path_2file is None:
        #
        #     path_2file = ""
        #
        # cdef unordered_map[string, unordered_map[string, int]] result_coll
        #
        # result_coll = self.tks.collocations_ngrams(x_vector, path_2folder, path_2file, file_delimiter, n_gram_delimiter)
        #
        # self.result_collocations = result_coll
        #
        # return np.sort(list(result_coll))          # extract keys() in both python 2 and 3

        pass

    def print_collocations(self, word = None):
        """
        :param word: a character string. The collocations recorded for this word (see method collocation_words) will be returned.

        Example::

            tks = token_stats()

            res = tks.collocation_words(path_2file = '/myfolder/vocab_file.txt')

            tks.print_collocations(word = 'aword')

        .. note::

            This method should be called after the 'collocation_words' method is run. It prints the collocations for a specific 'word'
        """

        # assert isinstance(word, basestring), 'the word parameter should be of type string'
        #
        # assert word in list(self.result_collocations), "the specified 'word' is not included in the collocations dictionary. Return the 'collocation_words()' method to see the list of the available words"          # extract keys() in both python 2 and 3 using list()
        #
        # tmp_vals = self.result_collocations[word]
        #
        # tmp_sum = np.sum(listvalues(tmp_vals))
        #
        # for (k,v) in iteritems(tmp_vals):
        #
        #     tmp_vals[k] = float(np.round(v / float(tmp_sum), decimals = 3))          # first round then use float to get the correct rounding
        #
        # return tmp_vals

        pass

    def string_dissimilarity_matrix(self, words_vector = None, dice_n_gram = 2, method = 'dice', split_separator = " ", dice_thresh = 1.0, upper = True, diagonal = True, threads = 1):
        """
        :param words_vector: a string character list

        :param dice_n_gram: a numeric value specifying the n-gram for the dice method of the string_dissimilarity_matrix method

        :param method: a character string specifying the method to use in the string_dissimilarity_matrix method. One of dice, levenshtein or cosine

        :param split_separator: a character string specifying the string split separator if method equal cosine in the string_dissimilarity_matrix method. The cosine method uses sentences, so for a sentence : "this_is_a_word_sentence" the split_separator should be "_"

        :param dice_thresh: a float number to use to threshold the data if method is dice in the string_dissimilarity_matrix method. It takes values between 0.0 and 1.0. The closer the thresh is to 0.0 the more values of the dissimilarity matrix will take the value of 1.0.

        :param upper: either True or False. If True then both lower and upper parts of the dissimilarity matrix of the string_dissimilarity_matrix method will be shown. Otherwise the upper part will be filled with NA's

        :param diagonal: either True or False. If True then the diagonal of the dissimilarity matrix of the string_dissimilarity_matrix method will be shown. Otherwise the diagonal will be filled with NA's

        :param threads: a numeric value specifying the number of cores to use in parallel in the string_dissimilarity_matrix method

        Example::

            tks = token_stats()

            vocab_lst = ['the', 'term', 'planet', 'is', 'ancient', 'with', 'ties', 'to']

            res = tks.string_dissimilarity_matrix(words_vector = vocab_lst, dice_n_gram = 2, method = 'dice')

        .. note::

            The string_dissimilarity_matrix method returns a string-dissimilarity-matrix using either the dice, levenshtein or cosine distance. The input can be a character string list only.
            In case that the method is dice then the dice-coefficient (similarity) is calculated between two strings for a specific number of character n-grams ( dice_n_gram ).
        """

        # assert isinstance(words_vector, list), 'the words_vector parameter should be of type list'
        #
        # assert isinstance(dice_n_gram, int) and dice_n_gram > 0, 'the dice_n_gram parameter should be of type int and greater than 0'
        #
        # assert method in ["dice", "levenshtein", "cosine"], "available methods are 'dice', 'levenshtein' or 'cosine'"
        #
        # assert isinstance(split_separator, basestring), 'the split_separator parameter should be of type string'
        #
        # assert isinstance(dice_thresh, float) and (dice_thresh <= 1.0 and dice_thresh > 0.0), 'the dice_thresh parameter should be of type float'
        #
        # assert isinstance(upper, bool), 'the upper parameter should be of type boolean'
        #
        # assert isinstance(diagonal, bool), 'the diagonal parameter should be of type boolean'
        #
        # assert isinstance(threads, int) and threads > 0, 'the threads parameter should be of type int and greater than 0'
        #
        # cdef vector[vector[double]] dissim_mat
        #
        # sorted_vec = list(np.sort(words_vector))
        #
        # dissim_mat = self.tks.dissimilarity_mat(sorted_vec, dice_n_gram, method, split_separator, dice_thresh, upper, diagonal, threads)
        #
        # df = pd.DataFrame(dissim_mat, index = sorted_vec, columns = sorted_vec)
        #
        # return df

        pass

    def look_up_table(self, words_vector = None, n_grams = None):
        """
        :param words_vector: a string character list

        :param n_grams: a numeric value specifying the n-grams

        Example::

            tks = token_stats()

            vocab_lst = ['the', 'term', 'planet', 'is', 'ancient', 'with', 'ties', 'to']

            res = tks.look_up_table(words_vector = vocab_lst, n_grams = 4)

        .. note::

            The look_up_table returns a look-up-list where the list-names are the n-grams and the list-vectors are the words associated with those n-grams. The input can be a character string list only.
        """

        # assert isinstance(words_vector, list), 'the words_vector parameter should be of type list'
        #
        # assert isinstance(n_grams, int) and n_grams > 0, 'the n_grams parameter should be of type int and greater than 0'
        #
        # cdef unordered_map[string, vector[string]] look_up_tmp
        #
        # look_up_tmp = self.tks.look_up_tbl(words_vector, n_grams)
        #
        # self.result_look_up_tbl = look_up_tmp
        #
        # return np.sort(list(look_up_tmp))          # extract keys() in both python 2 and 3

        pass

    def print_words_lookup_tbl(self, n_gram = None):
        """
        :param n_gram: a character string specifying the n-gram

        Example::

            tks = token_stats()

            vocab_lst = ['the', 'term', 'planet', 'is', 'ancient', 'with', 'ties', 'to']

            res = tks.look_up_table(words_vector = vocab_lst, n_grams = 4)

            tks.print_words_lookup_tbl(n_gram = "_abo")

        .. note::

            This method should be called after the 'look_up_table' method is run. It returns words associated to n-grams in the look-up-table
        """

        # assert isinstance(n_gram, basestring), 'the n_gram parameter should be of type string'
        #
        # assert n_gram in list(self.result_look_up_tbl), "the specified 'n_gram' is not included in the look_up_table dictionary. Return the 'look_up_table()' method to see the list of the available n_grams"          # extract keys() in both python 2 and 3
        #
        # return self.result_look_up_tbl[n_gram]

        pass
# Smoke run: instantiate the class and call every public method once with its
# default arguments, in the order the methods are declared.
if __name__ == '__main__':
    tks = token_stats()
    tks.path_2vector()
    tks.freq_distribution()
    tks.count_character()
    tks.print_count_character()
    tks.collocation_words()
    tks.print_collocations()
    tks.string_dissimilarity_matrix()
    tks.look_up_table()
    tks.print_words_lookup_tbl()