Source code for cltk.tokenizers.line

"""Tokenize lines."""

__author__ = [
    "Patrick J. Burns <patrick@diyclassics.org>",
    "Andrew Deloucas <adeloucas@g.harvard.edu>",
]
__license__ = "MIT License. See LICENSE."


class LineTokenizer:
    """Tokenize text by line; designed for study of poetry."""

    def __init__(self, language: str):
        """Store the lower-cased language name.

        :type language: str
        :param language: Language of the text to be tokenized.
        """
        # Kept in case language-specific line tokenization is ever needed;
        # nothing else in this class reads it.
        self.language = language.lower()

    def tokenize(self, untokenized_string: str, include_blanks: bool = False):
        r"""Split a string into its lines.

        Splitting is delegated to ``str.splitlines()``, so every line
        boundary that method recognizes (e.g. ``\n``, ``\r\n``, ``\r``)
        ends a line, and the line-break characters themselves are not
        included in the returned strings.

        :type untokenized_string: str
        :param untokenized_string: A string containing one or more lines.
        :type include_blanks: bool
        :param include_blanks: If True, empty lines are preserved as ``""``
            in the returned list; if False (the default) they are dropped.
        :rtype: list of strings
        :raises AssertionError: If ``untokenized_string`` is not a ``str``.
        """
        # Raise explicitly rather than using an ``assert`` statement: asserts
        # are removed under ``python -O``, which would silently disable this
        # check. The exception type and message are unchanged so existing
        # callers that catch AssertionError keep working.
        if not isinstance(untokenized_string, str):
            raise AssertionError("Incoming argument must be a string.")

        lines = untokenized_string.splitlines()
        if include_blanks:
            return lines
        # Only exactly-empty lines count as blank; whitespace-only lines stay.
        return [line for line in lines if line]
Source code for cltk.tokenizers.line

The Classical Language Toolkit

Navigation

Related Topics