Source code for cltk.prosody.lat.string_utils

"""Utility functions for processing scansion and text."""

import re
import sys
import unicodedata

# Module metadata.
__author__ = ["Todd Cook <todd.g.cook@gmail.com>"]
__license__ = "MIT License"

"""Helper methods for processing scansion"""
# Compiled pattern for the letter pair "qu" in any mix of upper and lower case.
qu_matcher = re.compile("[qQ][uU]")


def remove_punctuation_dict() -> dict[int, None]:
    """
    Build a ``str.translate`` table that deletes punctuation.

    Every code point whose Unicode general category starts with "P" is mapped to ``None``,
    so translating a string with the table drops those characters; all other characters,
    blank spaces included, pass through untouched.

    :return: dict mapping each punctuation code point to None

    >>> print("I'm ok! Oh #%&*()[]{}!? Fine!".translate(
    ... remove_punctuation_dict()).lstrip())
    Im ok Oh  Fine
    """
    punctuation_code_points = (
        code_point
        for code_point in range(sys.maxunicode)
        if unicodedata.category(chr(code_point)).startswith("P")
    )
    # dict.fromkeys assigns None to every key, which is what str.translate treats as "delete".
    return dict.fromkeys(punctuation_code_points)


def punctuation_for_spaces_dict() -> dict[int, str]:
    """
    Build a ``str.translate`` table that turns each punctuation character into a blank space.

    Because every punctuation mark is replaced one-for-one, the translated string has the
    same length as the input, which keeps stress patterns aligned with the original vowel
    positions in the verse.

    :return: dict mapping each punctuation code point to a single space

    >>> print("I'm ok! Oh #%&*()[]{}!? Fine!".translate(
    ... punctuation_for_spaces_dict()).strip())
    I m ok  Oh              Fine
    """
    return {
        code_point: " "
        for code_point in range(sys.maxunicode)
        if unicodedata.category(chr(code_point)).startswith("P")
    }


def differences(scansion: str, candidate: str) -> list[int]:
    """
    Given two strings, return a list of index positions where the contents differ.

    Blank spaces are removed from both strings before comparing, so the returned indices
    refer to positions in the space-stripped scansion. Positions of the scansion that lie
    beyond the end of a shorter candidate are reported as differences (there is nothing to
    match them against). Surplus characters at the end of a longer candidate are ignored.

    :param scansion: the reference string
    :param candidate: the string compared against the reference
    :return: ascending list of differing index positions

    >>> differences("abc", "abz")
    [2]
    >>> differences("abcd", "ab")
    [2, 3]
    """
    before = scansion.replace(" ", "")
    after = candidate.replace(" ", "")
    shared = len(after)
    return [
        idx
        for idx, mark in enumerate(before)
        # The bounds check comes first so a short candidate can never be indexed past its end.
        if idx >= shared or mark != after[idx]
    ]


def mark_list(line: str) -> list[int]:
    """
    Return the index of every character in the line that is not a blank space.

    :param line: the text to inspect
    :return: ascending list of positions holding a non-space character

    >>> mark_list(" a b c")
    [1, 3, 5]
    """
    return [position for position, char in enumerate(line) if char != " "]


def space_list(line: str) -> list[int]:
    """
    Return the index of every blank space in the line.

    :param line: the text to inspect
    :return: ascending list of positions holding a space character

    >>> space_list("    abc ")
    [0, 1, 2, 3, 7]
    """
    return [position for position, char in enumerate(line) if char == " "]


def flatten(list_of_lists):
    """
    Concatenate the items of every inner list into a single new list.

    Only one level of nesting is removed.

    :param list_of_lists: an iterable whose elements are themselves iterable
    :return: a new list holding the inner items in their original order

    >>> flatten([ [1, 2, 3], [4, 5, 6]])
    [1, 2, 3, 4, 5, 6]
    """
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat


def to_syllables_with_trailing_spaces(line: str, syllables: list[str]) -> list[str]:
    """
    Given a line of syllables and spaces, and a list of syllables, produce a list of the
    syllables with trailing spaces attached as appropriate.

    Each syllable is located in the line, searching from where the previous one ended, and
    any run of blank spaces that follows it is appended to it. Leading spaces in the line are
    attached to the front of the first syllable. Processing stops as soon as the end of the
    line is reached.

    :param line: the verse text, containing only syllables and blank spaces
    :param syllables: the syllables of the line, in order of appearance
    :return: the syllables with adjoining spaces attached
    :raises ValueError: if a syllable cannot be found in the remainder of the line

    >>> to_syllables_with_trailing_spaces(' arma virumque cano ',
    ... ['ar', 'ma', 'vi', 'rum', 'que', 'ca', 'no' ])
    [' ar', 'ma ', 'vi', 'rum', 'que ', 'ca', 'no ']
    """
    line_length = len(line)
    padded = []
    cursor = 0
    for position, syllable in enumerate(syllables):
        # A syllable beginning with "w" that is absent from the line is retried with u for w.
        if syllable not in line and re.match("w", syllable, flags=re.IGNORECASE):
            syllable = syllable.replace("w", "u").replace("W", "U")
        start = line.index(syllable, cursor)
        cursor = start + len(syllable)
        if position == 0 and start > 0:
            # line starts with punctuation, substituted w/ spaces
            syllable = " " * start + syllable
        # Measure the run of blank spaces that directly follows the syllable.
        run_end = cursor
        while run_end < line_length and line[run_end] == " ":
            run_end += 1
        syllable += " " * (run_end - cursor)
        padded.append(syllable)
        if run_end == line_length:
            # Nothing but (possibly zero) spaces remained: the line is exhausted.
            return padded
        if run_end > cursor:
            # Resume the next search from the last space of the run.
            cursor = run_end - 1
    return padded


def join_syllables_spaces(syllables: list[str], spaces: list[int]) -> str:
    """
    Concatenate the syllables and insert a blank space at each of the given positions.

    The positions are applied one after another, in the order given, to the growing
    character sequence, so each index refers to the text as it stands after the earlier
    insertions.

    :param syllables: the syllables to join
    :param spaces: index positions at which a space is inserted
    :return: the joined text with the spaces inserted

    >>> join_syllables_spaces(["won", "to", "tree", "dun"], [3, 6, 11])
    'won to tree dun'
    """
    characters = [*"".join(syllables)]
    for position in spaces:
        characters.insert(position, " ")
    # Every element is a single character, so a plain join rebuilds the line.
    return "".join(characters)


def starts_with_qu(word) -> bool:
    """
    Determine whether or not a word starts with the letters Q and U, in either case.

    Only the first two characters are examined, so a "qu" that occurs later in the word
    does not count.

    :param word: the word to test
    :return: True if the word begins with "qu" (case-insensitively), otherwise False

    >>> starts_with_qu("qui")
    True
    >>> starts_with_qu("Quirites")
    True
    >>> starts_with_qu("aqua")
    False
    """
    # Anchor the test at the start of the word; an unanchored search would also accept
    # words that merely contain "qu".
    return word[:2].lower() == "qu"


def stress_positions(stress: str, scansion: str) -> list[int]:
    """
    Locate a stress mark within a scansion line, ignoring blank spaces.

    The spaces are stripped from the scansion first, so the returned indices count only
    the scansion marks themselves.

    :param stress: the single mark to look for
    :param scansion: the scansion line, possibly padded with spaces
    :return: ascending list of positions (in the space-stripped line) equal to the mark

    >>> stress_positions("-", "    -  U   U - UU    - U U")
    [0, 3, 6]
    """
    return [
        position
        for position, mark in enumerate(scansion.replace(" ", ""))
        if mark == stress
    ]


def merge_elisions(elided: list[str]) -> str:
    """
    Given a list of strings with different space swapping elisions applied, merge the elisions,
    taking the most without compounding the omissions.

    The first string is the base; every position that is a blank space in any of the other
    strings is blanked in the result as well. The strings are expected to be variants of one
    line and therefore of equal length. An empty list yields an empty string.

    :param elided: variants of the same line, each with some letters replaced by spaces
    :return: the line with every elision applied
    :raises IndexError: if a later string has a blank space past the end of the first string

    >>> merge_elisions([
    ... "ignavae agua multum hiatus", "ignav   agua multum hiatus" ,"ignavae agua mult   hiatus"])
    'ignav   agua mult   hiatus'
    >>> merge_elisions([])
    ''
    """
    if not elided:
        return ""
    merged = list(elided[0])
    # The base already carries its own blanks, so only the remaining variants are scanned.
    for variant in elided[1:]:
        for idx, char in enumerate(variant):
            if char == " ":
                merged[idx] = " "
    return "".join(merged)


def move_consonant_right(letters: list[str], positions: list[int]) -> list[str]:
    """
    Shift the string at each listed position onto the front of its right-hand neighbor.

    The vacated slot is left as an empty string. The list is modified in place and the same
    list object is returned.

    :param letters: the letter groups to rearrange
    :param positions: indices whose contents are moved one slot to the right
    :return: the same list, after the moves

    >>> move_consonant_right(list("abbra"), [ 2, 3])
    ['a', 'b', '', '', 'bra']
    """
    for index in positions:
        neighbor = index + 1
        combined = letters[index] + letters[neighbor]
        letters[neighbor] = combined
        letters[index] = ""
    return letters


def move_consonant_left(letters: list[str], positions: list[int]) -> list[str]:
    """
    Append the string at each listed position to the end of its left-hand neighbor.

    The vacated slot is left as an empty string. The list is modified in place and the same
    list object is returned.

    :param letters: the letter groups to rearrange
    :param positions: indices whose contents are moved one slot to the left
    :return: the same list, after the moves

    >>> move_consonant_left(['a', 'b', '', '', 'bra'], [1])
    ['ab', '', '', '', 'bra']
    """
    for index in positions:
        letters[index - 1] += letters[index]
        letters[index] = ""
    return letters


def merge_next(letters: list[str], positions: list[int]) -> list[str]:
    """
    Absorb the right-hand neighbor into the string at each listed position.

    The neighbor's slot is left as an empty string. The list is modified in place and the
    same list object is returned.

    :param letters: the letter groups to merge
    :param positions: indices that take over the contents of the slot after them
    :return: the same list, after the merges

    >>> merge_next(['a', 'b', 'o', 'v', 'o' ], [0, 2])
    ['ab', '', 'ov', '', 'o']
    >>> # Note: each merge sees the slots emptied by earlier ones, so the effect is not cumulative:
    >>> merge_next(['a', 'b', 'o', 'v', 'o' ], [0, 2, 3])
    ['ab', '', 'ov', 'o', '']
    """
    for index in positions:
        following = index + 1
        letters[index] += letters[following]
        letters[following] = ""
    return letters


def remove_blanks(letters: list[str]):
    """
    Return a new list containing the given items minus any empty strings.

    Items made of blank spaces are kept; only the empty string is dropped.

    :param letters: the items to filter
    :return: a new list without empty strings

    >>> remove_blanks(['a', '', 'b', '', 'c'])
    ['a', 'b', 'c']
    """
    return [item for item in letters if item != ""]


def split_on(word: str, section: str) -> tuple[str, str]:
    """
    Cut a word in two directly after the first occurrence of a section.

    The first part runs from the start of the word through the end of the section; the
    second part is whatever follows.

    :param word: the string to split
    :param section: the substring that ends the first part
    :return: tuple of (text up to and including the section, remaining text)
    :raises ValueError: if the section does not occur in the word

    >>> split_on('hamrye', 'ham')
    ('ham', 'rye')
    """
    cut = word.index(section) + len(section)
    return word[:cut], word[cut:]


def remove_blank_spaces(syllables: list[str]) -> list[str]:
    """
    Return a new list without the items that are a single blank space or an empty string.

    :param syllables: the items to filter
    :return: a new list holding every other item, in the original order

    >>> remove_blank_spaces(['', 'a', ' ', 'b', ' ', 'c', ''])
    ['a', 'b', 'c']
    """
    return [item for item in syllables if item != " " and item != ""]


def overwrite(
    char_list: list[str], regexp: str, quality: str, offset: int = 0
) -> list[str]:
    r"""
    Replace one character per regular-expression match with the given quality.

    The characters are joined into a single string and searched with the pattern. For every
    match, the list element at the match's start index plus the offset is overwritten. The
    list is modified in place and the same list object is returned.

    :param char_list: a list of characters and spaces
    :param regexp: the regular expression to search for
    :param quality: the replacement written into the list
    :param offset: distance from the start of each match to the element that is replaced
    :return: the same list, after the replacements

    >>> overwrite(list("multe igne"), r"e\s[aeiou]", " ")
    ['m', 'u', 'l', 't', ' ', ' ', 'i', 'g', 'n', 'e']
    """
    matcher = re.compile(regexp)
    # Matching runs against a snapshot string, so edits to the list do not affect later matches.
    for hit in matcher.finditer("".join(char_list)):
        char_list[hit.start() + offset] = quality
    return char_list


def overwrite_dipthong(char_list: list[str], regexp: str, quality: str) -> list[str]:
    r"""
    Replace the first two characters of every regular-expression match with the quality.

    The characters are joined into a single string and searched with the pattern. For every
    match, the list elements at the match's start index and at the index after it are
    overwritten. The list is modified in place and the same list object is returned.

    :param char_list: a list of characters
    :param regexp: a matching regular expression
    :param quality: a quality or character to replace
    :return: a list of characters with the dipthong overwritten

    >>> overwrite_dipthong(list("multae aguae"), r"ae\s[aeou]", " ")
    ['m', 'u', 'l', 't', ' ', ' ', ' ', 'a', 'g', 'u', 'a', 'e']
    """
    matcher = re.compile(regexp)
    # Matching runs against a snapshot string, so edits to the list do not affect later matches.
    for hit in matcher.finditer("".join(char_list)):
        first = hit.start()
        char_list[first] = quality
        char_list[first + 1] = quality
    return char_list


def get_unstresses(stresses: list[int], count: int) -> list[int]:
    """
    Given a list of stressed positions, and count of possible positions, return a list of
    the unstressed positions.

    The result is always in ascending order. Stressed positions outside ``range(count)``
    have no effect.

    :param stresses: a list of stressed positions
    :param count: the number of possible positions
    :return: an ascending list of unstressed positions

    >>> get_unstresses([0, 3, 6, 9, 12, 15], 17)
    [1, 2, 4, 5, 7, 8, 10, 11, 13, 14, 16]
    """
    stressed = set(stresses)
    # Walking range(count) fixes the output order instead of relying on set iteration order.
    return [position for position in range(count) if position not in stressed]
Source code for cltk.prosody.lat.string_utils

The Classical Language Toolkit

Navigation

Related Topics