feat: extracting audio by given text

41dc26f1 · Anton Kudryavtsev · 5f664fbc · 41dc26f1 · 41dc26f1 · 41dc26f1
Verified Commit 41dc26f1 authored 1 year ago by Anton Kudryavtsev
--- a/api/v1/audio.py
+++ b/api/v1/audio.py
@@ -8,6 +8,9 @@ from pydantic.error_wrappers import ValidationError

 from config import get_config
 from core.plugins.no_mem import get_audio_plugins
+from core import task_system
+from huey.api import Result
+

 from .auth import get_current_active_user
 from .models import (
@@ -17,6 +20,7 @@ from .models import (
    ModelsDataReponse,
    TaskCreateResponse,
    UploadFileResponse,
+    AudioExtractPhrasesRequest,
 )
 from .task_utils import _get_job_result, _get_job_status, create_audio_task

@@ -241,3 +245,53 @@ async def get_response(task_id: UUID) -> AudioProcessingResponse:
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="There is no such audio processing task",
        ) from error
+
+
+@router.post(
+    "/extract",
+    response_model=TaskCreateResponse,
+    status_code=200,
+    summary=(
+        "The endpoint `/extract` extracts the specified phrases from the "
+        "given audio file using the specified audio model"
+    ),
+    responses={
+        200: {"description": "Task was successfully created and scheduled"},
+        404: {
+            "description": "The specified file or model was not found.",
+            "content": {
+                "application/json": {
+                    "example": {
+                        "detail": "No such audio file available",
+                    }
+                }
+            },
+        },
+    },
+)
+async def extract_text_from_audio(
+    request: AudioExtractPhrasesRequest,
+) -> TaskCreateResponse:
+    """
+    Schedule a background task that looks for the requested phrases in an
+    uploaded audio file and cuts the matching fragments out of it.
+
+    Parameters:
+    - **audio_file**: an uuid of file to process
+    - **audio_model**: an audio processing model name (check '_/models_' for available models)
+    - **phrases**: a list of phrases to look for in the audio file
+
+    Responses:
+    - 200, Task was successfully created and scheduled
+    - 404, No such audio file available
+    - 404, No such audio model available
+    """
+    # Validate the model first: an unknown model is reported before the
+    # storage directory is consulted.
+    audio_plugin_info = get_audio_plugins().get(request.audio_model)
+    if audio_plugin_info is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="No such audio model available",
+        )
+
+    # Uploaded files are looked up in the audio storage directory by UUID.
+    audio_file_path = config.storage.audio_dir / str(request.audio_file)
+    if not audio_file_path.exists():
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND, detail="No such audio file available"
+        )
+
+    # The call returns a result handle; its id is handed back to the client
+    # as the task id.
+    job: Result = task_system.extact_phrases_from_audio(  # type: ignore
+        audio_plugin_info.class_name,
+        audio_file_path.as_posix(),
+        request.phrases,
+    )
+    return TaskCreateResponse(task_id=UUID(job.id))
--- a/api/v1/models.py
+++ b/api/v1/models.py
@@ -112,3 +112,19 @@ class AudioTextComparisonResultsResponse(BaseModel):

 class MultipleTasksStatusResponse(BaseModel):
    data: List[TaskStatusResponse]
+
+
+class AudioExtractPhrasesRequest(BaseModel):
+    # Request body of the `/extract` endpoint.
+    #
+    # audio_file:  UUID of the audio file to process; the endpoint resolves it
+    #              to a path inside the configured audio storage directory.
+    # audio_model: name of the audio processing model; the endpoint looks it
+    #              up among the loaded audio plugins.
+    # phrases:     phrases to search for in the audio file.
+    audio_file: UUID
+    audio_model: str
+    phrases: List[str]
+
+
+class AudioPhrase(BaseModel):
+    # Result for a single requested phrase.
+    # NOTE(review): presumably the API-level counterpart of
+    # core.plugins.base.AudioPhrase - confirm the mapping between the two.
+    #
+    # audio_segment: audio fragment for the phrase, or None.
+    # found:         flag telling whether the phrase was found.
+    # phrase:        the phrase this entry describes.
+    audio_segment: AudioChunk | None
+    found: bool
+    phrase: str
+
+
+class AudioExtractPhrasesResponse(BaseModel):
+    # Response wrapper for phrase extraction: a list of per-phrase results.
+    data: List[AudioPhrase]
--- a/core/plugins/base.py
+++ b/core/plugins/base.py
@@ -71,6 +71,16 @@ class AudioToTextComparisonResponse(BaseModel):
    errors: List[TextDiff]


+class AudioPhrase(BaseModel):
+    # Outcome of searching for one phrase in a processed audio file.
+    #
+    # audio_segment: segment covering the phrase, or None when the phrase
+    #                was not found.
+    # found:         whether the phrase was found; the phrase extraction task
+    #                sets it to True exactly when audio_segment is not None.
+    # phrase:        the phrase that was searched for.
+    audio_segment: AudioSegment | None
+    found: bool
+    phrase: str
+
+
+class AudioExtractPhrasesResponse(BaseModel):
+    # Result of the phrase extraction task: one AudioPhrase per requested
+    # phrase.
+    data: List[AudioPhrase]
+
+
 @runtime_checkable
 class BasePlugin(Protocol):
    """

--- a/core/task_system.py
+++ b/core/task_system.py
 import logging
-from typing import Any, Dict
+from typing import Any, Dict, Tuple, List

 from huey import RedisHuey
 from typing import List
@@ -17,10 +17,14 @@ from core.plugins.base import (
    AudioToImageComparisonResponse,
    AudioToTextComparisonResponse,
    TextDiff,
+    AudioPhrase,
+    AudioExtractPhrasesResponse,
+    AudioProcessingFunction,
 )
 from core.plugins.loader import PluginInfo
 from core.processing.audio_split import split_audio
-from core.processing.text import match_phrases
+from core.processing.text import match_phrases, find_phrases
+

 scheduler = RedisHuey()

@@ -232,3 +236,70 @@ def _get_image_plugins() -> Dict[str, PluginInfo]:
    loaded into the worker image plugins.
    """
    return IMAGE_PLUGINS
+
+
+def _extact_phrases_from_audio(
+    audio_class: str, audio_path: str, phrases: List[str]
+) -> AudioExtractPhrasesResponse:
+    """
+    Find every requested phrase in the audio file and cut its fragment out.
+
+    The audio is processed into text segments once; each phrase is then
+    looked up among the segment texts. The segments matching a phrase are
+    joined into a single segment spanning from the first match's start to
+    the last match's end, and the audio is split by those intervals.
+
+    Parameters:
+    - audio_class: class name of the audio plugin used for processing
+    - audio_path: path to the audio file
+    - phrases: phrases to search for
+
+    Returns a response holding one `AudioPhrase` per requested phrase, in the
+    same order as `phrases`; phrases that were not found have
+    `found=False` and no audio segment.
+    """
+    import copy
+
+    # extract text from audio
+    audio_processing_result = _audio_process(
+        audio_class, AudioProcessingFunction, audio_path
+    )
+    audio_segments = audio_processing_result.segments
+    extracted_phrases = [s.text for s in audio_segments]
+
+    # one entry per requested phrase: the joined segment, or None if not found
+    audio_chunks: List[AudioSegment | None] = []
+
+    for search_phrase in phrases:
+        segment_indexes = find_phrases(extracted_phrases, search_phrase)
+
+        if len(segment_indexes) == 0:
+            audio_chunks.append(None)
+            continue
+
+        matched = [audio_segments[index] for index in segment_indexes]
+
+        # Work on a copy: mutating the recognised segment in place would make
+        # phrases that start at the same segment (or a repeated phrase) share
+        # one object, so their text, interval and file would overwrite each
+        # other, and later lookups would read already modified boundaries.
+        joined_segment = copy.deepcopy(matched[0])
+        joined_segment.text = " ".join(segment.text for segment in matched)
+        joined_segment.start = matched[0].start
+        joined_segment.end = matched[-1].end
+
+        audio_chunks.append(joined_segment)
+
+    # split the audio by the intervals of the phrases that were found
+    found_chunks = [chunk for chunk in audio_chunks if chunk is not None]
+    found_intervals: List[Tuple[float, float]] = [
+        (chunk.start, chunk.end) for chunk in found_chunks
+    ]
+    files = split_audio(audio_path, found_intervals)
+
+    # the i-th file belongs to the i-th found chunk
+    for position, chunk in enumerate(found_chunks):
+        chunk.file = files[position]
+
+    data: List[AudioPhrase] = [
+        AudioPhrase(audio_segment=chunk, found=chunk is not None, phrase=phrase)
+        for phrase, chunk in zip(phrases, audio_chunks)
+    ]
+
+    return AudioExtractPhrasesResponse(data=data)
+
+
+@scheduler.task()
+def extact_phrases_from_audio(
+    audio_class: str, audio_path: str, phrases: List[str]
+) -> AudioExtractPhrasesResponse:
+    """
+    Scheduler task wrapper around `_extact_phrases_from_audio`.
+
+    Takes the audio plugin class name, the audio file path and the phrases
+    to search for, and returns the extraction response unchanged.
+    """
+    return _extact_phrases_from_audio(
+        audio_class=audio_class,
+        audio_path=audio_path,
+        phrases=phrases,
+    )