Interface for the fasttext library

fasttext_interface(
  list_params,
  path_output = "",
  MilliSecs = 100,
  path_input = "",
  remove_previous_file = TRUE,
  print_process_time = FALSE
)

Arguments

list_params

a list of valid parameters

path_output

a character string specifying the file path where the process logs (or the output in general) should be saved

MilliSecs

an integer specifying the delay in milliseconds when printing the results to the specified path_output

path_input

a character string specifying the path to the input data file

remove_previous_file

a boolean. If TRUE and the path_output is not an empty string (""), then an existing file with the same output name will be removed

print_process_time

a boolean. If TRUE then the processing time of the function will be printed out in the R session

Value

a vector of class character that includes the parameters and file paths used as input to the function

Details

This function allows the user to run the various methods included in the fasttext library from within R

The "output" parameter, which exists in the named list (see the examples section) that is passed to the "list_params" parameter of the "fasttext_interface()" function, is a file path and not a directory name. It will actually return two files (a *.vec* and a *.bin*) to the output directory.

References

https://github.com/facebookresearch/fastText

https://github.com/facebookresearch/fastText/blob/master/docs/supervised-tutorial.md

Examples


if (FALSE) {

# The examples are wrapped in 'if (FALSE)' so that they are never executed
# automatically: they depend on data files that the user has to provide.

library(fastText)


####################################################################################
# If the user intends to run the following examples then he / she must replace     #
# the 'input', 'output', 'path_input', 'path_output', 'model' and 'test_data' file #
# paths depending on where the data are located or should be saved!                #
# ( 'tempdir()' is used here as an example folder )                                #
####################################################################################


# ------------------------------------------------
# print information for the Usage of each function [ parameters ]
# ------------------------------------------------

fastText::printUsage()
fastText::printTestUsage()
fastText::printTestLabelUsage()
fastText::printQuantizeUsage()
fastText::printPrintWordVectorsUsage()
fastText::printPrintSentenceVectorsUsage()
fastText::printPrintNgramsUsage()
fastText::printPredictUsage()
fastText::printNNUsage()
fastText::printDumpUsage()
fastText::printAnalogiesUsage()
fastText::print_parameters(command = "supervised")

# -----------------------------------------------------------------------
# In case that the 'command' is one of 'cbow', 'skipgram' or 'supervised'
# -----------------------------------------------------------------------

# 'output' is a file path and not a directory name (see the 'Details' section):
# a '.vec' and a '.bin' file are returned for the given path.
list_params <- list(command = 'cbow',
                    lr = 0.1,
                    dim = 200,
                    input = file.path(tempdir(), "doc.txt"),
                    output = file.path(tempdir(), "model_cbow"),
                    verbose = 2,
                    thread = 1)

# 'path_output' receives the process-logs, 'MilliSecs' is the printing delay
res <- fasttext_interface(list_params,
                          path_output = file.path(tempdir(), "model_logs.txt"),
                          MilliSecs = 100)


# ---------------------
# 'supervised' training
# ---------------------

# The model trained here ('model_cooking.bin') is the one that the
# examples below pass as the 'model' parameter.
list_params <- list(command = 'supervised',
                    lr = 0.1,
                    dim = 200,
                    input = file.path(tempdir(), "cooking.train"),
                    output = file.path(tempdir(), "model_cooking"),
                    verbose = 2,
                    thread = 1)

res <- fasttext_interface(list_params,
                          path_output = file.path(tempdir(), 'logs_supervise.txt'),
                          MilliSecs = 5)

# ---------------------------------------
# In case that the 'command' is 'predict'
# ---------------------------------------

list_params <- list(command = 'predict',
                    model = file.path(tempdir(), 'model_cooking.bin'),
                    test_data = file.path(tempdir(), 'cooking.valid'),
                    k = 1,
                    th = 0.0)

res <- fasttext_interface(list_params,
                          path_output = file.path(tempdir(), 'predict_valid.txt'))


# ------------------------------------
# In case that the 'command' is 'test'  [ k = 5 , means that precision and recall are at 5 ]
# ------------------------------------

list_params <- list(command = 'test',
                    model = file.path(tempdir(), 'model_cooking.bin'),
                    test_data = file.path(tempdir(), 'cooking.valid'),
                    k = 5,
                    th = 0.0)

# It only prints 'Precision', 'Recall' to the R session
res <- fasttext_interface(list_params)


# ------------------------------------------
# In case that the 'command' is 'test-label'   [ k = 5 , means that precision and recall are at 5 ]
# ------------------------------------------

list_params <- list(command = 'test-label',
                    model = file.path(tempdir(), 'model_cooking.bin'),
                    test_data = file.path(tempdir(), 'cooking.valid'),
                    k = 5,
                    th = 0.0)

# prints also 'Precision', 'Recall' to R session
res <- fasttext_interface(list_params,
                          path_output = file.path(tempdir(), "test_valid.txt"))

# -----------------
# quantize function  [ it will take a .bin file and return an .ftz file ]
# -----------------

# the quantize function is currently (01/02/2019) single-threaded
# https://github.com/facebookresearch/fastText/issues/353#issuecomment-342501742

# The dot is escaped and the pattern is anchored so that only the trailing
# '.bin' extension is replaced (an unescaped '.' matches any character).
list_params <- list(command = 'quantize',
                    input = file.path(tempdir(), 'model_cooking.bin'),
                    output = file.path(tempdir(),
                                       sub('\\.bin$', '.ftz', 'model_cooking.bin')))

res <- fasttext_interface(list_params)


# -----------------
# quantize function  [ by using the optional parameters 'qnorm' and 'qout' ]
# -----------------

list_params <- list(command = 'quantize',
                    input = file.path(tempdir(), 'model_cooking.bin'),
                    output = file.path(tempdir(),
                                       sub('\\.bin$', '.ftz', 'model_cooking.bin')),
                    qnorm = TRUE,
                    qout = TRUE)

res <- fasttext_interface(list_params)


# ------------------
# print-word-vectors   [ each line of the 'queries.txt' must be a single word ]
# ------------------

list_params <- list(command = 'print-word-vectors',
                    model = file.path(tempdir(), 'model_cooking.bin'))

res <- fasttext_interface(list_params,
                          path_input = file.path(tempdir(), 'queries.txt'),
                          path_output = file.path(tempdir(), 'print_vecs_file.txt'))


# ----------------------
# print-sentence-vectors   [ See also the comments in the main.cc file about the input-file ]
# ----------------------

list_params <- list(command = 'print-sentence-vectors',
                    model = file.path(tempdir(), 'model_cooking.bin'))

res <- fasttext_interface(list_params,
                          path_input = file.path(tempdir(), 'text.txt'),
                          path_output = file.path(tempdir(), 'SENTENCE_VECs.txt'))


# ------------
# print-ngrams       [ print to console or to output-file ]
# ------------

# First train a 'skipgram' model. The 'output' file path is set to 'ngram_out'
# so that the 'ngram_out.bin' model, which the 'print-ngrams' command below
# loads, is the one produced by this step ('ngram_out.txt' holds only the logs).
list_params <- list(command = 'skipgram', lr = 0.1, dim = 200,
                    input = file.path(tempdir(), "doc.txt"),
                    output = file.path(tempdir(), "ngram_out"),
                    verbose = 2, thread = 1,
                    minn = 2, maxn = 2)

res <- fasttext_interface(list_params,
                          path_output = file.path(tempdir(), "ngram_out.txt"),
                          MilliSecs = 5)

# print n-grams for specific word
list_params <- list(command = 'print-ngrams',
                    model = file.path(tempdir(), 'ngram_out.bin'),
                    word = 'word')

# print output to console
res <- fasttext_interface(list_params, path_output = "")

# output to file
res <- fasttext_interface(list_params,
                          path_output = file.path(tempdir(), "NGRAMS.txt"))


# -------------
# 'nn' function
# -------------

# a 'query_word' is required
list_params <- list(command = 'nn',
                    model = file.path(tempdir(), 'model_cooking.bin'),
                    k = 20,
                    query_word = 'word')

res <- fasttext_interface(list_params,
                          path_output = file.path(tempdir(), "nn_output.txt"))


# ---------
# analogies   [ in the output file each analogy-triplet-result is separated with a newline ]
# ---------

list_params <- list(command = 'analogies',
                    model = file.path(tempdir(), 'model_cooking.bin'),
                    k = 5)

res <- fasttext_interface(list_params,
                          path_input = file.path(tempdir(), 'analogy_queries.txt'),
                          path_output = file.path(tempdir(), 'analogies_output.txt'))

# -------------
# dump function  [ the 'option' param should be one of 'args', 'dict', 'input' or 'output' ]
# -------------

list_params <- list(command = 'dump',
                    model = file.path(tempdir(), 'model_cooking.bin'),
                    option = 'args')

res <- fasttext_interface(list_params,
                          path_output = file.path(tempdir(), "DUMP.txt"))

}