Innopolis University DevOps Playground
Skip to content
Snippets Groups Projects
Commit 5f664fbc authored by Anton Kudryavtsev's avatar Anton Kudryavtsev
Browse files

Merge branch 'style/30-document-algorithms-for-the-customer' into 'main'

Style fixes and text finding functionality

See merge request !43
parents 3d550dde 810ee7d1
No related branches found
No related tags found
1 merge request!43Style fixes and text finding functionality
Pipeline #3438 passed with stages
in 6 minutes and 42 seconds
......@@ -92,9 +92,11 @@ def split_silence(
max_interval *= 1000
if silence_chunks[0][0] != 0:
audio = AudioSegment.silent(100) + audio
for i in range(len(silence_chunks)):
silence_chunks[i] = (silence_chunks[i][0] + 100, silence_chunks[i][1] + 100)
silence_chunks.insert((0, 100), 0)
silence_chunks = [
(silence_chunks[i][0] + 100, silence_chunks[i][1] + 100)
for i in range(len(silence_chunks))
]
silence_chunks.insert(0, (0, 100))
if silence_chunks[-1][1] != len(audio):
silence_chunks.append((len(audio), len(audio) + 100))
audio = audio + AudioSegment.silent(100)
......
from typing import List, Tuple
import itertools
from math import inf
from typing import Any, List, Tuple
def match_words(
first_text_str: str, second_text_str: str
) -> List[Tuple[int, str, str]]:
def __backtracking_levenshtein(
first_text: List[str] | str, second_text: List[str] | str
) -> Tuple[List[str], int]:
"""
Matches two texts and returns the difference via a list of errors
(i.e. the changes that need to be made to the first text to obtain the second)
:param first_text_str: the text in which we try to find the errors
:param second_text_str: the "correct" text
:return: a List[Tuple(the index at which the error occurs (the beginning of the phrase to replace),
the incorrect phrase,
the correct phrase)]
Levenshtein distance algorithm that takes two texts and returns the optimal transitions list and the final distance
Treats a list as a collection of words separated by spaces
:param first_text: a text to be compared, either as a string or a list of words
:param second_text: a text to be compared, either as a string or a list of words
:return: Tuple[path that the dynamic programming algorithm found in format of
List["entry from first text or "_" if empty" + "-" + "entry from second text or "_" if empty"],
the levenshtein distance computed]
"""
# Split the text so the algorithm compares whole words
first_text = first_text_str.split()
second_text = second_text_str.split()
# If we need to compare words, we need to separate them with spaces in the final answer
if isinstance(first_text, list):
separator = " "
else:
separator = ""
# This algorithm uses levenshtein distance to determine the most probable matching
# Initialize the dynamic programming table
levenshtein_dp = [
[461782368126487236] * (len(second_text) + 1)
for i in range(len(first_text) + 1)
[inf] * (len(second_text) + 1) for i in range(len(first_text) + 1)
]
# Count the dynamic programming table as per the usual algorithm
......@@ -30,92 +33,183 @@ def match_words(
levenshtein_dp[i][0] = i
for i in range(1, len(second_text) + 1):
levenshtein_dp[0][i] = i
for i in range(1, len(first_text) + 1):
for j in range(1, len(second_text) + 1):
levenshtein_dp[i][j] = min(
levenshtein_dp[i - 1][j - 1]
+ (first_text[i - 1] != second_text[j - 1]),
levenshtein_dp[i - 1][j] + 1,
levenshtein_dp[i][j - 1] + 1,
)
for i, j in itertools.product(
range(1, len(first_text) + 1), range(1, len(second_text) + 1)
):
levenshtein_dp[i][j] = min(
levenshtein_dp[i - 1][j - 1] + (first_text[i - 1] != second_text[j - 1]),
levenshtein_dp[i - 1][j] + 1,
levenshtein_dp[i][j - 1] + 1,
)
# Backtracks the result getting the list of matched words
word_result: List[str] = list()
backtrack_result: List[str] = list()
current_row = len(first_text)
current_column = len(second_text)
while current_row * current_column != 0:
if first_text[current_row - 1] == second_text[current_column - 1]:
word_result.append(first_text[current_row - 1])
backtrack_result.append(first_text[current_row - 1])
current_row -= 1
current_column -= 1
continue
optimal = min(
levenshtein_dp[current_row - 1][current_column - 1],
levenshtein_dp[current_row - 1][current_column],
levenshtein_dp[current_row][current_column - 1],
)
if optimal == levenshtein_dp[current_row - 1][current_column - 1]:
word_result.append(
backtrack_result.append(
first_text[current_row - 1] + "-" + second_text[current_column - 1]
)
current_row -= 1
current_column -= 1
elif optimal == levenshtein_dp[current_row - 1][current_column]:
word_result.append(first_text[current_row - 1] + "-_")
backtrack_result.append(first_text[current_row - 1] + "-_")
current_row -= 1
elif optimal == levenshtein_dp[current_row][current_column - 1]:
word_result.append("_-" + second_text[current_column - 1])
backtrack_result.append("_-" + second_text[current_column - 1])
current_column -= 1
# Put all the remaining symbols(words) in the answer
if current_row != 0:
word_result.append(" ".join(first_text[:current_row]) + "-_")
backtrack_result.append(separator.join(first_text[:current_row]) + "-_")
elif current_column != 0:
word_result.append("_-" + " ".join(second_text[:current_column]))
backtrack_result.append("_-" + separator.join(second_text[:current_column]))
return backtrack_result, int(levenshtein_dp[-1][-1])
# joining all separate words so we have a unified answer
def __joined_levenshtein(
first_text: List[str] | str, second_text: List[str] | str
) -> Tuple[List[List[str] | str], int]:
"""
Joins the answers given by the initial levenshtein algorithm so they resemble errors more
Treats a list as a collection of words separated by spaces
:param first_text: a text to be compared, either as a string or a list of words
:param second_text: a text to be compared, either as a string or a list of words
:return: Tuple[the joined result in the format of
List[List[the phrase from the first text to be replaced, the replacement from the second text]
OR
str if both texts are the same at this interval],
the levenshtein distance computed]
"""
# Compute the full levenshtein answer
backtrack_result, distance = __backtracking_levenshtein(first_text, second_text)
# If we need to compare words, we need to separate them with spaces in the final answer
if isinstance(first_text, str):
separator = ""
else:
separator = " "
# Create the answer list and initialize it with the first element
joined_result: List[List[str] | str] = list()
for current_str in word_result[::-1]:
if not joined_result:
if "-" in current_str:
joined_result.append(current_str.split("-"))
else:
joined_result.append(current_str)
if "-" in backtrack_result[-1]:
joined_result.append(backtrack_result[-1].split("-"))
else:
joined_result.append(backtrack_result[-1])
# Add the rest of the errors, keeping track of type to know whether we should start creating a new phrase
for current_str in backtrack_result[-2::-1]:
if isinstance(joined_result[-1], list) and "-" in current_str:
# We are collecting two separate phrases and the current_str is also separate, continue the last phrase
joined_result[-1][0] += separator + current_str[: current_str.find("-")]
joined_result[-1][1] += separator + current_str[current_str.find("-") + 1 :]
elif isinstance(joined_result[-1], list):
if "-" in current_str:
if joined_result[-1][0][-1] != "_":
joined_result[-1][0] += " "
joined_result[-1][0] += current_str[: current_str.find("-")]
if joined_result[-1][1][-1] != "_":
joined_result[-1][1] += " "
joined_result[-1][1] += current_str[current_str.find("-") + 1 :]
else:
joined_result.append(current_str)
# We are collecting two separate phrases and the current_str is the same, start a new phrase
joined_result.append(current_str)
elif "-" in current_str:
# We are collecting two same phrases and the current_str is separate, start a new phrase
joined_result.append(current_str.split("-"))
else:
if "-" in current_str:
joined_result.append(current_str.split("-"))
else:
joined_result[-1] += " " + current_str
# We are collecting two same phrases and the current_str is also the same, continue the last phrase
joined_result[-1] += separator + current_str
for current_str in joined_result: # type: ignore
if isinstance(current_str, list):
current_str[0] = current_str[0].replace("_", "")
current_str[1] = current_str[1].replace("_", "")
# Replace any symbols that symbolise empty space with empty space
for data in joined_result:
if not isinstance(data, list):
continue
while "_ " in data[0] or "_ " in data[1]:
data[0] = data[0].replace("_ ", "_")
data[1] = data[1].replace("_ ", "_")
data[0] = data[0].replace("_", "")
data[1] = data[1].replace("_", "")
return joined_result, distance
def match(
first_text_str: str, second_text_str: str, separate_words: bool
) -> Tuple[List[Tuple[int, str, str]], int]:
"""
Matches two texts and returns the difference via a list of errors
(i.e. the changes that need to be made to the first text to obtain the second)
:param first_text_str: the text in which we try to find the errors
:param second_text_str: the "correct" text
:param separate_words: whether the algorithm matches using whole words or just symbols
:return: a Tuple[List[Tuple(the index at which the error occurs (the beginning of the phrase to replace),
the incorrect phrase,
the correct phrase)],
the distance between the texts]
"""
# Split the text if needed so the algorithm compares whole words
if separate_words:
first_text: List[str] | str = first_text_str.split()
second_text: List[str] | str = second_text_str.split()
else:
first_text = first_text_str
second_text = second_text_str
# This algorithm uses levenshtein distance to determine the most probable matching
joined_result, distance = __joined_levenshtein(first_text, second_text)
# Calculating the final answer by cross-referencing the joined answers with the initial text
answer: List[Tuple[int, str, str]] = []
first_index = 0
for current_str in joined_result: # type: ignore
if type(current_str) == list:
answer.append((first_index, current_str[0], current_str[1]))
first_index += len(current_str[0])
if current_str[0] != "":
for data in joined_result:
if isinstance(data, list):
answer.append((first_index, data[0], data[1]))
first_index += len(data[0])
if separate_words and data[0] != "":
first_index += 1
else:
first_index += len(current_str) + 1
return answer
first_index += len(data) + 1
return answer, distance
def __match_symbols(
first_text: str, second_text: str
) -> Tuple[List[Tuple[int, str, str]], int]:
"""
Interface for match() that matches using symbols
Used for symbol matching for finding short phrases in text more reliably
:param first_text: the text in which we try to find the errors
:param second_text: the "correct" text
:return: a Tuple[List[Tuple(the index at which the error occurs (the beginning of the phrase to replace),
the incorrect phrase,
the correct phrase)],
the distance between the texts]
"""
return match(first_text, second_text, False)
def match_words(first_text: str, second_text: str) -> List[Tuple[int, str, str]]:
"""
Interface for match() that matches using whole words
Used for finding errors in most cases
:param first_text: the text in which we try to find the errors
:param second_text: the "correct" text
:return: List[Tuple(the index at which the error occurs (the beginning of the phrase to replace),
the incorrect phrase,
the correct phrase)]
"""
return match(first_text, second_text, True)[0]
def match_phrases(phrases: List[str], text: str) -> List[List[Tuple[int, str, str]]]:
......@@ -131,8 +225,8 @@ def match_phrases(phrases: List[str], text: str) -> List[List[Tuple[int, str, st
"""
# Preparing the texts so that capital letters and non-letter symbols are ignored
better_text, text_indices = prep_text(text)
better_phrases, phrase_indices = prep_text(" ".join(phrases))
better_text, text_indices = __prep_text(text)
better_phrases, phrase_indices = __prep_text(" ".join(phrases))
# Calculating the full answer using levenshtein distance
full_answer = match_words(better_phrases, better_text)
......@@ -161,10 +255,10 @@ def match_phrases(phrases: List[str], text: str) -> List[List[Tuple[int, str, st
return answers
def prep_text(text: str) -> Tuple[str, List[int]]:
def __prep_text(text: str) -> Tuple[str, List[int]]:
"""
Prepares the text so it is fully lowercase and does not contain any non-letter symbols
It is using inbuilt isalpha() and lower() functions so it should support multiple languages
Prepares the text, so it is fully lowercase and does not contain any non-letter symbols
It is using inbuilt isalpha() and lower() functions, so it should support multiple languages
:param text: the text to be prepared
:return: the changed text and a list of indices that map the changed text to the initial one
"""
......@@ -191,6 +285,139 @@ def prep_text(text: str) -> Tuple[str, List[int]]:
return changed.lower().strip(), indices
def __find(iterable: List[Any] | str, start_index: int, to_find: Any, step: int) -> int:
"""
Helper function for finding the index of a certain element in a list
Works with any initial position and step(whole, non-zero)
:param iterable: The list or string to search
:param start_index: The initial position for the search
:param to_find: The element or symbol to be found
:param step: The step with which to search for (can be negative)
:return: The index of the first found instance that follows the search parameters
returns len(iterable) (or -1 with negative step) if nothing was found
"""
# Check for validness of step
if type(step) != int or step == 0:
raise ValueError("Step of 0 or not int step")
# Set iteration bound so negative steps work
if step < 0:
iteration_end = -1
else:
iteration_end = len(iterable)
# Iterate to find the element
for i in range(start_index, iteration_end, step):
if iterable[i] == to_find:
return i
# If it was not found, return the end of iteration
return iteration_end
def find_phrases(phrases: List[str], to_find: str, margin: float = 1.05) -> List[int]:
"""
Finds a piece of text in a list of phrases and returns the indices of the phrases in which the text appears in
:param phrases: a list of phrases
:param to_find: the text to be found
:param margin: the margin for error, used for bloating the search window if to_find has contractions
:return: the indices of the phrases which compose to_find
"""
# Prepare text to ignore multiple spaces and non-letter symbols
better_text, text_indices = __prep_text(to_find)
better_phrases, phrase_indices = __prep_text(" ".join(phrases))
# Compute the size of a window to compare to the text
window = int(len(better_text) * margin)
# Find the window that best fits the string
best_window_max = inf
best_window_result = []
best_end = 0
for j in range(window, len(better_phrases) + 1):
new_ans, lev_dist = __match_symbols(better_text, better_phrases[j - window : j])
if best_window_max > lev_dist:
best_window_result = new_ans
best_window_max = lev_dist
best_end = j
best_beg = best_end - window
# Trim the window to exclude unnecessary symbols (trims using full words)
optimal_ans = best_window_result
first_word = __find(
optimal_ans[0][2], len(optimal_ans[0][2]) - len(optimal_ans[0][1]) - 1, " ", -1
)
best_beg += first_word + 1
last_word = __find(optimal_ans[-1][2], len(optimal_ans[-1][1]), " ", 1)
best_end -= len(optimal_ans[-1][2]) - last_word
# Transform the indices from prepared text to initial text
actual_beg = phrase_indices[best_beg]
actual_end = phrase_indices[best_end - 1] + 1
# Iterate through phrases to compute the final answer
answer = []
current_index = 0
for phrase in range(len(phrases)):
current_index += len(phrases[phrase]) + 1
if (
current_index > actual_beg
and current_index - len(phrases[phrase]) - 1 <= actual_end
):
answer.append(phrase)
return answer
# tests = {
# "test_cases": [
# {
# "phrases": [
# "The headache won't go away. She's taking medicine but even that didn't help.",
# "The monster's throbbing in her head continued.",
# "This happened to her only once before in her life and she realized that only one thing could be happening."
# ],
# "to_find": "she realized that only one thing could be happening",
# "answer": [
# 2
# ]
# },
# {
# "phrases": [
# "The headache won't go away. She's taking medicine but even that didn't help.",
# "The monster's throbbing in her head continued.",
# "This happened to her only once before in her life and she realized that only one thing could be happening."
# ],
# "to_find": "she has taken medicine",
# "answer": [
# 1
# ]
# },
# {
# "phrases": [
# "The headache won't go away. She's taking medicine but even that didn't help.",
# "The monster's throbbing in her head continued.",
# "This happened to her only once before in her life and she realized that only one thing could be happening."
# ],
# "to_find": "throbbing in her head continued this happened to her",
# "answer": [
# 2,
# 3
# ]
# }
# ]
# }
#
# for test in tests["test_cases"]:
# print(find_phrases(test["phrases"], test["to_find"]))
# print(test["answer"])
#
# phrases = [
# " The headache won't go away. She's taking medicine but even that didn't help.",
# " The monster's throbbing in her head continued.",
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment