Merge branch 'style/30-document-algorithms-for-the-customer' into 'main'

Style fixes and text finding functionality See merge request !43

Merge branch 'style/30-document-algorithms-for-the-customer' into 'main'
Style fixes and text finding functionality See merge request !43
5f664fbc · Anton Kudryavtsev · 3d550dde · 810ee7d1 · 5f664fbc · 5f664fbc
Commit 5f664fbc authored 1 year ago by Anton Kudryavtsev
--- a/core/processing/audio_split.py
+++ b/core/processing/audio_split.py
@@ -92,9 +92,11 @@ def split_silence(
    max_interval *= 1000
    if silence_chunks[0][0] != 0:
        audio = AudioSegment.silent(100) + audio
-        for i in range(len(silence_chunks)):
-            silence_chunks[i] = (silence_chunks[i][0] + 100, silence_chunks[i][1] + 100)
-        silence_chunks.insert((0, 100), 0)
+        silence_chunks = [
+            (silence_chunks[i][0] + 100, silence_chunks[i][1] + 100)
+            for i in range(len(silence_chunks))
+        ]
+        silence_chunks.insert(0, (0, 100))
    if silence_chunks[-1][1] != len(audio):
        silence_chunks.append((len(audio), len(audio) + 100))
        audio = audio + AudioSegment.silent(100)

--- a/core/processing/text.py
+++ b/core/processing/text.py
-from typing import List, Tuple
+import itertools
+from math import inf
+from typing import Any, List, Tuple


-def match_words(
-    first_text_str: str, second_text_str: str
-) -> List[Tuple[int, str, str]]:
+def __backtracking_levenshtein(
+    first_text: List[str] | str, second_text: List[str] | str
+) -> Tuple[List[str], int]:
    """
-    Matches two texts and returns the difference via a list of errors
-    (i.e. the changes that need to be made to the first text to obtain the second)
-    :param first_text_str: the text in which we try to find the errors
-    :param second_text_str: the "correct" text
-    :return: a List[Tuple(the index at which the error occurs (the beginning of the phrase to replace),
-                          the incorrect phrase,
-                          the correct phrase)]
+    Levenshtein distance algorithm that takes two texts and returns the optimal transitions list and the final distance
+    Treats a list as a collection of words separated by spaces
+    :param first_text: a text to be compared, either as a string or a list of words
+    :param second_text: a text to be compared, either as a string or a list of words
+    :return: Tuple[path that the dynamic programming algorithm found in format of
+                   List["entry from first text or "_" if empty" + "-" + "entry from second text or "_" if empty"],
+                   the levenshtein distance computed]
    """

-    # Split the text so the algorithm compares whole words
-    first_text = first_text_str.split()
-    second_text = second_text_str.split()
+    # If we need to compare words, we need to separate them with spaces in the final answer
+    if isinstance(first_text, list):
+        separator = " "
+    else:
+        separator = ""

-    # This algorithm uses levenshtein distance to determine the most probable matching
+    # Initialize the dynamic programming table
    levenshtein_dp = [
-        [461782368126487236] * (len(second_text) + 1)
-        for i in range(len(first_text) + 1)
+        [inf] * (len(second_text) + 1) for i in range(len(first_text) + 1)
    ]

    # Count the dynamic programming table as per the usual algorithm
@@ -30,92 +33,183 @@ def match_words(
        levenshtein_dp[i][0] = i
    for i in range(1, len(second_text) + 1):
        levenshtein_dp[0][i] = i
-    for i in range(1, len(first_text) + 1):
-        for j in range(1, len(second_text) + 1):
-            levenshtein_dp[i][j] = min(
-                levenshtein_dp[i - 1][j - 1]
-                + (first_text[i - 1] != second_text[j - 1]),
-                levenshtein_dp[i - 1][j] + 1,
-                levenshtein_dp[i][j - 1] + 1,
-            )
+    for i, j in itertools.product(
+        range(1, len(first_text) + 1), range(1, len(second_text) + 1)
+    ):
+        levenshtein_dp[i][j] = min(
+            levenshtein_dp[i - 1][j - 1] + (first_text[i - 1] != second_text[j - 1]),
+            levenshtein_dp[i - 1][j] + 1,
+            levenshtein_dp[i][j - 1] + 1,
+        )

    # Backtracks the result getting the list of matched words
-    word_result: List[str] = list()
+    backtrack_result: List[str] = list()
    current_row = len(first_text)
    current_column = len(second_text)
    while current_row * current_column != 0:
        if first_text[current_row - 1] == second_text[current_column - 1]:
-            word_result.append(first_text[current_row - 1])
+            backtrack_result.append(first_text[current_row - 1])
            current_row -= 1
            current_column -= 1
            continue
+
        optimal = min(
            levenshtein_dp[current_row - 1][current_column - 1],
            levenshtein_dp[current_row - 1][current_column],
            levenshtein_dp[current_row][current_column - 1],
        )
        if optimal == levenshtein_dp[current_row - 1][current_column - 1]:
-            word_result.append(
+            backtrack_result.append(
                first_text[current_row - 1] + "-" + second_text[current_column - 1]
            )
            current_row -= 1
            current_column -= 1
        elif optimal == levenshtein_dp[current_row - 1][current_column]:
-            word_result.append(first_text[current_row - 1] + "-_")
+            backtrack_result.append(first_text[current_row - 1] + "-_")
            current_row -= 1
        elif optimal == levenshtein_dp[current_row][current_column - 1]:
-            word_result.append("_-" + second_text[current_column - 1])
+            backtrack_result.append("_-" + second_text[current_column - 1])
            current_column -= 1

+    # Put all the remaining symbols(words) in the answer
    if current_row != 0:
-        word_result.append(" ".join(first_text[:current_row]) + "-_")
+        backtrack_result.append(separator.join(first_text[:current_row]) + "-_")
    elif current_column != 0:
-        word_result.append("_-" + " ".join(second_text[:current_column]))
+        backtrack_result.append("_-" + separator.join(second_text[:current_column]))
+
+    return backtrack_result, int(levenshtein_dp[-1][-1])
+

-    # joining all separate words so we have a unified answer
+def __joined_levenshtein(
+    first_text: List[str] | str, second_text: List[str] | str
+) -> Tuple[List[List[str] | str], int]:
+    """
+    Joins the answers given by the initial levenshtein algorithm so they resemble errors more
+    Treats a list as a collection of words separated by spaces
+    :param first_text: a text to be compared, either as a string or a list of words
+    :param second_text: a text to be compared, either as a string or a list of words
+    :return: Tuple[the joined result in the format of
+                   List[List[the phrase from the first text to be replaced, the replacement from the second text]
+                        OR
+                        str if both texts are the same at this interval],
+                   the levenshtein distance computed]
+    """
+
+    # Compute the full levenshtein answer
+    backtrack_result, distance = __backtracking_levenshtein(first_text, second_text)
+
+    # If we need to compare words, we need to separate them with spaces in the final answer
+    if isinstance(first_text, str):
+        separator = ""
+    else:
+        separator = " "
+
+    # Create the answer list and initialize it with the first element
    joined_result: List[List[str] | str] = list()

-    for current_str in word_result[::-1]:
-        if not joined_result:
-            if "-" in current_str:
-                joined_result.append(current_str.split("-"))
-            else:
-                joined_result.append(current_str)
+    if "-" in backtrack_result[-1]:
+        joined_result.append(backtrack_result[-1].split("-"))
+    else:
+        joined_result.append(backtrack_result[-1])
+
+    # Add the rest of the errors, keeping track of type to know whether we should start creating a new phrase
+    for current_str in backtrack_result[-2::-1]:
+        if isinstance(joined_result[-1], list) and "-" in current_str:
+            # We are collecting two separate phrases and the current_str is also separate, continue the last phrase
+            joined_result[-1][0] += separator + current_str[: current_str.find("-")]
+            joined_result[-1][1] += separator + current_str[current_str.find("-") + 1 :]
        elif isinstance(joined_result[-1], list):
-            if "-" in current_str:
-                if joined_result[-1][0][-1] != "_":
-                    joined_result[-1][0] += " "
-                joined_result[-1][0] += current_str[: current_str.find("-")]
-
-                if joined_result[-1][1][-1] != "_":
-                    joined_result[-1][1] += " "
-                joined_result[-1][1] += current_str[current_str.find("-") + 1 :]
-            else:
-                joined_result.append(current_str)
+            # We are collecting two separate phrases and the current_str is the same, start a new phrase
+            joined_result.append(current_str)
+        elif "-" in current_str:
+            # We are collecting two same phrases and the current_str is separate, start a new phrase
+            joined_result.append(current_str.split("-"))
        else:
-            if "-" in current_str:
-                joined_result.append(current_str.split("-"))
-            else:
-                joined_result[-1] += " " + current_str
+            # We are collecting two same phrases and the current_str is also the same, continue the last phrase
+            joined_result[-1] += separator + current_str

-    for current_str in joined_result:  # type: ignore
-        if isinstance(current_str, list):
-            current_str[0] = current_str[0].replace("_", "")
-            current_str[1] = current_str[1].replace("_", "")
+    # Replace any symbols that symbolise empty space with empty space
+    for data in joined_result:
+        if not isinstance(data, list):
+            continue
+
+        while "_ " in data[0] or "_ " in data[1]:
+            data[0] = data[0].replace("_ ", "_")
+            data[1] = data[1].replace("_ ", "_")
+        data[0] = data[0].replace("_", "")
+        data[1] = data[1].replace("_", "")
+
+    return joined_result, distance
+
+
+def match(
+    first_text_str: str, second_text_str: str, separate_words: bool
+) -> Tuple[List[Tuple[int, str, str]], int]:
+    """
+    Matches two texts and returns the difference via a list of errors
+    (i.e. the changes that need to be made to the first text to obtain the second)
+    :param first_text_str: the text in which we try to find the errors
+    :param second_text_str: the "correct" text
+    :param separate_words: whether the algorithm matches using whole words or just symbols
+    :return: a Tuple[List[Tuple(the index at which the error occurs (the beginning of the phrase to replace),
+                          the incorrect phrase,
+                          the correct phrase)],
+                     the distance between the texts]
+    """
+
+    # Split the text if needed so the algorithm compares whole words
+    if separate_words:
+        first_text: List[str] | str = first_text_str.split()
+        second_text: List[str] | str = second_text_str.split()
+    else:
+        first_text = first_text_str
+        second_text = second_text_str
+
+    # This algorithm uses levenshtein distance to determine the most probable matching
+    joined_result, distance = __joined_levenshtein(first_text, second_text)

    # Calculating the final answer by cross-referencing the joined answers with the initial text
    answer: List[Tuple[int, str, str]] = []
    first_index = 0

-    for current_str in joined_result:  # type: ignore
-        if type(current_str) == list:
-            answer.append((first_index, current_str[0], current_str[1]))
-            first_index += len(current_str[0])
-            if current_str[0] != "":
+    for data in joined_result:
+        if isinstance(data, list):
+            answer.append((first_index, data[0], data[1]))
+            first_index += len(data[0])
+            if separate_words and data[0] != "":
                first_index += 1
        else:
-            first_index += len(current_str) + 1
-    return answer
+            first_index += len(data) + 1
+    return answer, distance
+
+
+def __match_symbols(
+    first_text: str, second_text: str
+) -> Tuple[List[Tuple[int, str, str]], int]:
+    """
+    Interface for match() that matches using symbols
+    Used for symbol matching for finding short phrases in text more reliably
+    :param first_text: the text in which we try to find the errors
+    :param second_text: the "correct" text
+    :return: a Tuple[List[Tuple(the index at which the error occurs (the beginning of the phrase to replace),
+                          the incorrect phrase,
+                          the correct phrase)],
+                     the distance between the texts]
+    """
+    return match(first_text, second_text, False)
+
+
+def match_words(first_text: str, second_text: str) -> List[Tuple[int, str, str]]:
+    """
+    Interface for match() that matches using whole words
+    Used for finding errors in most cases
+    :param first_text: the text in which we try to find the errors
+    :param second_text: the "correct" text
+    :return: List[Tuple(the index at which the error occurs (the beginning of the phrase to replace),
+                        the incorrect phrase,
+                        the correct phrase)]
+    """
+    return match(first_text, second_text, True)[0]


 def match_phrases(phrases: List[str], text: str) -> List[List[Tuple[int, str, str]]]:
@@ -131,8 +225,8 @@ def match_phrases(phrases: List[str], text: str) -> List[List[Tuple[int, str, st
    """

    # Preparing the texts so that capital letters and non-letter symbols are ignored
-    better_text, text_indices = prep_text(text)
-    better_phrases, phrase_indices = prep_text(" ".join(phrases))
+    better_text, text_indices = __prep_text(text)
+    better_phrases, phrase_indices = __prep_text(" ".join(phrases))

    # Calculating the full answer using levenshtein distance
    full_answer = match_words(better_phrases, better_text)
@@ -161,10 +255,10 @@ def match_phrases(phrases: List[str], text: str) -> List[List[Tuple[int, str, st
    return answers


-def prep_text(text: str) -> Tuple[str, List[int]]:
+def __prep_text(text: str) -> Tuple[str, List[int]]:
    """
-    Prepares the text so it is fully lowercase and does not contain any non-letter symbols
-    It is using inbuilt isalpha() and lower() functions so it should support multiple languages
+    Prepares the text, so it is fully lowercase and does not contain any non-letter symbols
+    It is using inbuilt isalpha() and lower() functions, so it should support multiple languages
    :param text: the text to be prepared
    :return: the changed text and a list of indices that map the changed text to the initial one
    """
@@ -191,6 +285,139 @@ def prep_text(text: str) -> Tuple[str, List[int]]:
    return changed.lower().strip(), indices


+def __find(iterable: List[Any] | str, start_index: int, to_find: Any, step: int) -> int:
+    """
+    Helper function for finding the index of a certain element in a list
+    Works with any initial position and step(whole, non-zero)
+    :param iterable: The list or string to search
+    :param start_index: The initial position for the search
+    :param to_find: The element or symbol to be found
+    :param step: The step with which to search for (can be negative)
+    :return: The index of the first found instance that follows the search parameters
+             returns len(iterable) (or -1 with negative step) if nothing was found
+    """
+
+    # Check for validness of step
+    if type(step) != int or step == 0:
+        raise ValueError("Step of 0 or not int step")
+
+    # Set iteration bound so negative steps work
+    if step < 0:
+        iteration_end = -1
+    else:
+        iteration_end = len(iterable)
+
+    # Iterate to find the element
+    for i in range(start_index, iteration_end, step):
+        if iterable[i] == to_find:
+            return i
+
+    # If it was not found, return the end of iteration
+    return iteration_end
+
+
+def find_phrases(phrases: List[str], to_find: str, margin: float = 1.05) -> List[int]:
+    """
+    Finds a piece of text in a list of phrases and returns the indices of the phrases in which the text appears in
+    :param phrases: a list of phrases
+    :param to_find: the text to be found
+    :param margin: the margin for error, used for bloating the search window if to_find has contractions
+    :return: the indices of the phrases which compose to_find
+    """
+
+    # Prepare text to ignore multiple spaces and non-letter symbols
+    better_text, text_indices = __prep_text(to_find)
+    better_phrases, phrase_indices = __prep_text(" ".join(phrases))
+
+    # Compute the size of a window to compare to the text
+    window = int(len(better_text) * margin)
+
+    # Find the window that best fits the string
+    best_window_max = inf
+    best_window_result = []
+
+    best_end = 0
+    for j in range(window, len(better_phrases) + 1):
+        new_ans, lev_dist = __match_symbols(better_text, better_phrases[j - window : j])
+        if best_window_max > lev_dist:
+            best_window_result = new_ans
+            best_window_max = lev_dist
+            best_end = j
+    best_beg = best_end - window
+
+    # Trim the window to exclude unnecessary symbols (trims using full words)
+    optimal_ans = best_window_result
+
+    first_word = __find(
+        optimal_ans[0][2], len(optimal_ans[0][2]) - len(optimal_ans[0][1]) - 1, " ", -1
+    )
+    best_beg += first_word + 1
+
+    last_word = __find(optimal_ans[-1][2], len(optimal_ans[-1][1]), " ", 1)
+    best_end -= len(optimal_ans[-1][2]) - last_word
+
+    # Transform the indices from prepared text to initial text
+    actual_beg = phrase_indices[best_beg]
+    actual_end = phrase_indices[best_end - 1] + 1
+
+    # Iterate through phrases to compute the final answer
+    answer = []
+
+    current_index = 0
+    for phrase in range(len(phrases)):
+        current_index += len(phrases[phrase]) + 1
+        if (
+            current_index > actual_beg
+            and current_index - len(phrases[phrase]) - 1 <= actual_end
+        ):
+            answer.append(phrase)
+
+    return answer
+
+
+# tests = {
+#     "test_cases": [
+#         {
+#             "phrases": [
+#                 "The headache won't go away. She's taking medicine but even that didn't help.",
+#                 "The monster's throbbing in her head continued.",
+#                 "This happened to her only once before in her life and she realized that only one thing could be happening."
+#             ],
+#             "to_find": "she realized that only one thing could be happening",
+#             "answer": [
+#                 2
+#             ]
+#         },
+#         {
+#             "phrases": [
+#                 "The headache won't go away. She's taking medicine but even that didn't help.",
+#                 "The monster's throbbing in her head continued.",
+#                 "This happened to her only once before in her life and she realized that only one thing could be happening."
+#             ],
+#             "to_find": "she has taken medicine",
+#             "answer": [
+#                 1
+#             ]
+#         },
+#         {
+#             "phrases": [
+#                 "The headache won't go away. She's taking medicine but even that didn't help.",
+#                 "The monster's throbbing in her head continued.",
+#                 "This happened to her only once before in her life and she realized that only one thing could be happening."
+#             ],
+#             "to_find": "throbbing in her head continued this happened to her",
+#             "answer": [
+#                 2,
+#                 3
+#             ]
+#         }
+#     ]
+# }
+#
+# for test in tests["test_cases"]:
+#     print(find_phrases(test["phrases"], test["to_find"]))
+#     print(test["answer"])
+#
 # phrases = [
 #  " The headache won't go away. She's taking medicine but even that didn't help.",
 #  " The monster's throbbing in her head continued.",