Source code for cltk.tokenizers.line

"""Tokenize lines."""

# Module metadata: contributor contact strings and the license notice.
__author__ = [
    "Patrick J. Burns <patrick@diyclassics.org>",
    "Andrew Deloucas <adeloucas@g.harvard.edu>",
]
__license__ = "MIT License. See LICENSE."


class LineTokenizer:
    """Tokenize text by line; designed for study of poetry."""

    def __init__(self, language):
        """Store the lower-cased language name.

        :type language: str
        :param language: Language of the text to be tokenized.
        """
        # Not consulted by ``tokenize``; kept in case there winds up being a
        # need for language-specific line tokenization.
        self.language = language.lower()

    def tokenize(self, untokenized_string: str, include_blanks=False):
        """Split a string into its lines.

        Splitting is delegated to ``str.splitlines``, so every line boundary
        that method recognises ends a line, and the terminators themselves are
        not included in the returned strings.

        :type untokenized_string: str
        :param untokenized_string: A string containing one or more lines.
        :param include_blanks: Boolean; if True, empty lines are preserved as
            "" in the returned list; if False (the default) they are dropped.
            Only exactly-empty lines count as blank: a line consisting of
            whitespace is always kept.
        :rtype: list of strings
        :raises AssertionError: If ``untokenized_string`` is not a string.
        """
        # Raise explicitly instead of using an ``assert`` statement: asserts
        # are removed under ``python -O``, which would let a non-string through
        # to fail later with an unrelated AttributeError. AssertionError is
        # kept as the exception type so existing callers are unaffected.
        if not isinstance(untokenized_string, str):
            raise AssertionError("Incoming argument must be a string.")

        lines = untokenized_string.splitlines()
        if include_blanks:
            return lines
        return [line for line in lines if line != ""]