Innopolis University DevOps Playground
Skip to content
Snippets Groups Projects
Verified Commit 41dc26f1 authored by Anton Kudryavtsev's avatar Anton Kudryavtsev
Browse files

feat: extracting audio by given text

parent 5f664fbc
No related branches found
No related tags found
1 merge request: !46 "Feature/split audio into segments by given array of phrases"
......@@ -8,6 +8,9 @@ from pydantic.error_wrappers import ValidationError
from config import get_config
from core.plugins.no_mem import get_audio_plugins
from core import task_system
from huey.api import Result
from .auth import get_current_active_user
from .models import (
......@@ -17,6 +20,7 @@ from .models import (
ModelsDataReponse,
TaskCreateResponse,
UploadFileResponse,
AudioExtractPhrasesRequest,
)
from .task_utils import _get_job_result, _get_job_status, create_audio_task
......@@ -241,3 +245,53 @@ async def get_response(task_id: UUID) -> AudioProcessingResponse:
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail="There is no such audio processing task",
) from error
@router.post(
    "/extract",
    response_model=TaskCreateResponse,
    status_code=200,
    # The summary previously referred to `/split`, which is not this route.
    summary=(
        "The endpoint `/extract` extracts specified phrases from given audio "
        "file using given audio model"
    ),
    responses={
        200: {"description": "Task was successfully created and scheduled"},
        404: {
            "description": "The specified file or model was not found.",
            "content": {
                "application/json": {
                    "example": {
                        "detail": "No such audio file available",
                    }
                }
            },
        },
    },
)
async def extract_text_from_audio(
    request: AudioExtractPhrasesRequest,
) -> TaskCreateResponse:
    """
    Schedule a task that extracts the requested phrases from an audio file.

    Parameters:
    - **audio_file**: an uuid of file to process
    - **audio_model**: an audio processing model name (check '_/models_' for available models)
    - **phrases**: a list of phrases to look for in the audio file

    Responses:
    - 200, Task was successfully created and scheduled
    - 404, No such audio file available
    - 404, No such audio model available
    """
    # Resolve both inputs first; the model is validated before the file.
    audio_plugin_info = get_audio_plugins().get(request.audio_model)
    audio_file_path = config.storage.audio_dir / str(request.audio_file)

    if audio_plugin_info is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No such audio model available",
        )

    if not audio_file_path.exists():
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No such audio file available",
        )

    # Hand the work to the task system; only the task id is returned here.
    job: Result = task_system.extact_phrases_from_audio(  # type: ignore
        audio_plugin_info.class_name,
        audio_file_path.as_posix(),
        request.phrases,
    )
    return TaskCreateResponse(task_id=UUID(job.id))
......@@ -112,3 +112,19 @@ class AudioTextComparisonResultsResponse(BaseModel):
class MultipleTasksStatusResponse(BaseModel):
    """Response body wrapping a list of task status entries."""

    # One TaskStatusResponse per reported task.
    data: List[TaskStatusResponse]
class AudioExtractPhrasesRequest(BaseModel):
    """Request body for extracting phrases from an audio file."""

    # Identifier of the audio file to process.
    audio_file: UUID
    # Name of the audio processing model to use.
    audio_model: str
    # Phrases to look for in the audio.
    phrases: List[str]
class AudioPhrase(BaseModel):
    """Search result for a single phrase."""

    # Audio chunk for the phrase; may be None
    # (presumably when the phrase was not found - confirm against the task code).
    audio_segment: AudioChunk | None
    # Whether the phrase was found.
    found: bool
    # The phrase that was searched for.
    phrase: str
class AudioExtractPhrasesResponse(BaseModel):
    """Response body listing the per-phrase extraction results."""

    # One AudioPhrase entry per phrase.
    data: List[AudioPhrase]
......@@ -71,6 +71,16 @@ class AudioToTextComparisonResponse(BaseModel):
errors: List[TextDiff]
class AudioPhrase(BaseModel):
    """Result of searching the processed audio for one phrase."""

    # Audio segment covering the phrase; may be None
    # (presumably when the phrase was not found - confirm against the task code).
    audio_segment: AudioSegment | None
    # Whether the phrase was found.
    found: bool
    # The phrase that was searched for.
    phrase: str
class AudioExtractPhrasesResponse(BaseModel):
    """Container for the results of a phrase extraction run."""

    # One AudioPhrase entry per phrase.
    data: List[AudioPhrase]
@runtime_checkable
class BasePlugin(Protocol):
"""
......
import logging
from typing import Any, Dict
from typing import Any, Dict, Tuple, List
from huey import RedisHuey
from typing import List
......@@ -17,10 +17,14 @@ from core.plugins.base import (
AudioToImageComparisonResponse,
AudioToTextComparisonResponse,
TextDiff,
AudioPhrase,
AudioExtractPhrasesResponse,
AudioProcessingFunction,
)
from core.plugins.loader import PluginInfo
from core.processing.audio_split import split_audio
from core.processing.text import match_phrases
from core.processing.text import match_phrases, find_phrases
scheduler = RedisHuey()
......@@ -232,3 +236,70 @@ def _get_image_plugins() -> Dict[str, PluginInfo]:
loaded into the worker image plugins.
"""
return IMAGE_PLUGINS
def _extact_phrases_from_audio(
    audio_class: str, audio_path: str, phrases: List[str]
) -> AudioExtractPhrasesResponse:
    """Locate each phrase in the processed audio and cut out its audio.

    The audio is processed once with the given plugin class. For every
    phrase the matching segments are looked up with ``find_phrases`` and
    merged into a single segment spanning from the start of the first
    match to the end of the last one. The audio file is then split by the
    merged intervals and each resulting file is attached to its segment.

    Args:
        audio_class: class name of the audio plugin used for processing.
        audio_path: path of the audio file to process and split.
        phrases: phrases to search for.

    Returns:
        A response with one ``AudioPhrase`` per input phrase, in input
        order. Phrases without a match have ``found=False`` and
        ``audio_segment=None``.

    Raises:
        ValueError: if ``split_audio`` does not return exactly one file
            per requested interval.
    """
    # Function-scope import keeps this block self-contained.
    import copy

    # Extract text from the audio.
    audio_processing_result = _audio_process(
        audio_class, AudioProcessingFunction, audio_path
    )
    audio_segments = audio_processing_result.segments
    extracted_phrases = [segment.text for segment in audio_segments]

    # One entry per requested phrase; None marks "not found".
    audio_chunks: List[AudioSegment | None] = []
    for search_phrase in phrases:
        segment_indexes = find_phrases(extracted_phrases, search_phrase)
        if len(segment_indexes) == 0:
            audio_chunks.append(None)
            continue

        matched = [audio_segments[i] for i in segment_indexes]
        # Work on an independent copy: mutating the transcription segment
        # in place would leak the joined text into later phrases and make
        # phrases starting at the same segment share (and overwrite) one
        # result object.
        joined_segment = copy.deepcopy(matched[0])
        joined_segment.text = " ".join(segment.text for segment in matched)
        joined_segment.start = matched[0].start
        joined_segment.end = matched[-1].end
        audio_chunks.append(joined_segment)

    # Split the audio only by the intervals of phrases that were found.
    found_chunks = [chunk for chunk in audio_chunks if chunk is not None]
    intervals: List[Tuple[float, float]] = [
        (chunk.start, chunk.end) for chunk in found_chunks
    ]
    # Nothing matched: there is nothing to cut, so skip the split call.
    files = split_audio(audio_path, intervals) if intervals else []

    # Attach each produced file to its segment; strict=True surfaces a
    # mismatch between requested intervals and returned files.
    for chunk, chunk_file in zip(found_chunks, files, strict=True):
        chunk.file = chunk_file

    data: List[AudioPhrase] = [
        AudioPhrase(audio_segment=chunk, found=chunk is not None, phrase=phrase)
        for phrase, chunk in zip(phrases, audio_chunks)
    ]
    return AudioExtractPhrasesResponse(data=data)
@scheduler.task()
def extact_phrases_from_audio(
    audio_class: str, audio_path: str, phrases: List[str]
) -> AudioExtractPhrasesResponse:
    """Scheduler task wrapper around ``_extact_phrases_from_audio``.

    Forwards its arguments unchanged and returns the helper's result.
    """
    response = _extact_phrases_from_audio(audio_class, audio_path, phrases)
    return response
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment