diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 053fc7ac..da56a0a9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -59,20 +59,20 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Set up Python 3.10 + - name: Set up Python 3.10.12 uses: actions/setup-python@v3 with: - python-version: "3.10" + python-version: "3.10.12" - name: Install dependencies run: | python -m pip install --upgrade pip + pip install nemo-toolkit[asr,nlp]==2.7.2 pip install -r requirements/main.txt pip install -r requirements/tests.txt sudo apt-get update sudo apt-get install -y libsndfile1 ffmpeg sox libsox-fmt-mp3 pip install pytorch_lightning pip install Cython wheel # need to pre-install to avoid error in nemo installation - pip install nemo-toolkit[asr,nlp]==2.2.1 pip install nemo_text_processing pip install -r requirements/huggingface.txt pip install pymarian diff --git a/dataset_configs/multilingual/granary/README.md b/dataset_configs/multilingual/granary/README.md index b3c0c474..f63b767b 100644 --- a/dataset_configs/multilingual/granary/README.md +++ b/dataset_configs/multilingual/granary/README.md @@ -63,7 +63,7 @@ pip install fasttext - `ConvertToTarredAudioDataset` (optional, only if tar-sharding is enabled) ```bash -pip install lhotse "nemo-toolkit[common]==2.2.1" +pip install lhotse "nemo-toolkit[common]==2.7.2" ``` ### Quick start diff --git a/dataset_configs/multilingual/granary/config.yaml b/dataset_configs/multilingual/granary/config.yaml index 78e778b0..140c0df6 100644 --- a/dataset_configs/multilingual/granary/config.yaml +++ b/dataset_configs/multilingual/granary/config.yaml @@ -71,7 +71,7 @@ documentation: | ``ConvertToTarredAudioDataset`` *(optional, only if tar-sharding is enabled)*:: - pip install lhotse "nemo-toolkit[common]==2.2.1" + pip install lhotse "nemo-toolkit[common]==2.7.2" Quick start ----------- diff --git a/docker/Dockerfile.tts_sdp b/docker/Dockerfile.tts_sdp index f174c7b1..0cb1ddb3 100644 --- 
a/docker/Dockerfile.tts_sdp +++ b/docker/Dockerfile.tts_sdp @@ -38,9 +38,17 @@ RUN rm -rf /src/NeMo-speech-data-processor/.git WORKDIR /src/NeMo-speech-data-processor RUN pip install -r requirements/main.txt RUN pip install -r requirements/tts.txt +RUN pip install pytest pytest-cov boto3 RUN pip install flash-attn --no-build-isolation RUN pip install https://github.com/LahiLuk/YouTokenToMe/archive/master.zip -RUN pip install megatron-core transformer_engine[pytorch]==2.4.0 -RUN pip install nemo_toolkit['all']==2.1.0 + +# newer versions of nemo do not have PunctuationCapitalizationModels +RUN pip install nemo_toolkit['all']==2.3.2 + +# nemo updates torch version, so we need to install the same version, for properly working tts +RUN python -m pip install --force-reinstall \ + torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 \ + --index-url https://download.pytorch.org/whl/cu121 +RUN python -m pip install "numpy<2.0.0" WORKDIR /src/NeMo-speech-data-processor \ No newline at end of file diff --git a/requirements/main.txt b/requirements/main.txt index 31a4a87d..ebc9f865 100644 --- a/requirements/main.txt +++ b/requirements/main.txt @@ -34,4 +34,4 @@ datasets>=2.14.0,<3.0.0 # for vLLMInference processor is required: pip install "optree>=0.13.0" vllm # for CometoidWMTQualityEstimation processor is required: pip install pymarian # for FastTextLangIdClassifier processor is required: pip install fasttext -# for ConvertToTarredAudioDatasetConfig processor can be additionally required: pip install lhotse "nemo-toolkit[common]==2.2.1" \ No newline at end of file +# for ConvertToTarredAudioDatasetConfig processor can be additionally required: pip install lhotse "nemo-toolkit[common]==2.7.2" \ No newline at end of file diff --git a/requirements/tests.txt b/requirements/tests.txt index 0f8b8675..b877e01e 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -4,5 +4,6 @@ pytest pytest-cov # lhotse requires torch and torchaudio to be present lhotse -torch -torchaudio 
\ No newline at end of file +torchaudio +torchcodec +fasttext \ No newline at end of file diff --git a/sdp/processors/inference/asr/faster_whisper/faster_whisper_inference.py b/sdp/processors/inference/asr/faster_whisper/faster_whisper_inference.py index b8e419d9..ad32d40c 100644 --- a/sdp/processors/inference/asr/faster_whisper/faster_whisper_inference.py +++ b/sdp/processors/inference/asr/faster_whisper/faster_whisper_inference.py @@ -388,7 +388,7 @@ def _write_words(words: List[Dict]): output_words_filepath = None if self.config.inference.word_timestamps: - output_words_filepath = _write_words(output_words_filepath, sample_words) + output_words_filepath = _write_words(sample_words) return dict(segments = output_segments_filepath, words = output_words_filepath) diff --git a/sdp/processors/inference/asr/nemo/utils/speech_to_text_with_vad.py b/sdp/processors/inference/asr/nemo/utils/speech_to_text_with_vad.py index 2c734754..7988975c 100644 --- a/sdp/processors/inference/asr/nemo/utils/speech_to_text_with_vad.py +++ b/sdp/processors/inference/asr/nemo/utils/speech_to_text_with_vad.py @@ -71,7 +71,7 @@ from nemo.collections.asr.data import feature_to_text_dataset from nemo.collections.asr.metrics.wer import word_error_rate -from nemo.collections.asr.models import ASRModel, EncDecClassificationModel +from nemo.collections.asr.models import ASRModel, EncDecClassificationModel, EncDecFrameClassificationModel from nemo.collections.asr.parts.submodules.ctc_decoding import CTCDecodingConfig from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest @@ -79,8 +79,6 @@ generate_overlap_vad_seq, generate_vad_segment_table, get_vad_stream_status, - init_frame_vad_model, - init_vad_model, ) from nemo.core.config import hydra_runner from nemo.utils import logging @@ -246,9 +244,9 @@ def extract_audio_features(manifest_filepath: str, cfg: DictConfig, record_fn: C 
out_dir.mkdir(parents=True, exist_ok=True) torch.set_grad_enabled(False) if cfg.vad_model: - vad_model = init_frame_vad_model(cfg.vad_model) + vad_model = init_frame_vad_model(cfg.vad_model, strict=False) else: - vad_model = EncDecClassificationModel.from_pretrained("vad_multilingual_marblenet") + vad_model = EncDecClassificationModel.from_pretrained("vad_multilingual_marblenet", strict=False) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") vad_model = vad_model.to(device) vad_model.eval() @@ -477,15 +475,43 @@ def generate_vad_frame_pred( return out_dir -def init_asr_model(model_path: str) -> ASRModel: +def init_frame_vad_model(model_path: str, strict: bool = False) -> EncDecFrameClassificationModel: + """ + Initiate VAD model with model path + """ + if model_path.endswith('.nemo'): + logging.info(f"Using local VAD model from {model_path}") + vad_model = EncDecFrameClassificationModel.restore_from(restore_path=model_path, strict=strict) + elif model_path.endswith('.ckpt'): + vad_model = EncDecFrameClassificationModel.load_from_checkpoint(checkpoint_path=model_path, strict=strict) + else: + logging.info(f"Using NGC cloud VAD model {model_path}") + vad_model = EncDecFrameClassificationModel.from_pretrained(model_name=model_path, strict=strict) + return vad_model + +def init_vad_model(model_path: str, strict: bool = False) -> EncDecClassificationModel: + """ + Initiate VAD model with model path + """ + if model_path.endswith('.nemo'): + logging.info(f"Using local VAD model from {model_path}") + vad_model = EncDecClassificationModel.restore_from(restore_path=model_path, strict=strict) + elif model_path.endswith('.ckpt'): + vad_model = EncDecClassificationModel.load_from_checkpoint(checkpoint_path=model_path, strict=strict) + else: + logging.info(f"Using NGC cloud VAD model {model_path}") + vad_model = EncDecClassificationModel.from_pretrained(model_name=model_path, strict=strict) + return vad_model + +def init_asr_model(model_path: str, strict: 
bool = True) -> ASRModel: if model_path.endswith('.nemo'): logging.info(f"Using local ASR model from {model_path}") - asr_model = ASRModel.restore_from(restore_path=model_path) + asr_model = ASRModel.restore_from(restore_path=model_path, strict=strict) elif model_path.endswith('.ckpt'): - asr_model = ASRModel.load_from_checkpoint(checkpoint_path=model_path) + asr_model = ASRModel.load_from_checkpoint(checkpoint_path=model_path, strict=strict) else: logging.info(f"Using NGC ASR model {model_path}") - asr_model = ASRModel.from_pretrained(model_name=model_path) + asr_model = ASRModel.from_pretrained(model_name=model_path, strict=strict) return asr_model diff --git a/sdp/processors/inference/llm/vllm/vllm.py index 9ef9e89c..041590ae 100644 --- a/sdp/processors/inference/llm/vllm/vllm.py +++ b/sdp/processors/inference/llm/vllm/vllm.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License.
-import yaml import json + +import yaml from tqdm import tqdm from sdp.processors.base_processor import BaseProcessor @@ -53,7 +54,7 @@ class vLLMInference(BaseProcessor): - model: https://docs.vllm.ai/en/latest/api/vllm/index.html#vllm.LLM - inference: https://docs.vllm.ai/en/v0.6.4/dev/sampling_params.html - - apply_chat_template: https://huggingface.co/docs/transformers/main/en/chat_templating#applychattemplate + - apply_chat_template: https://huggingface.co/docs/transformers/main/en/chat_templating Make sure to install `optree>=0.13.0` and `vllm` before using this processor: pip install "optree>=0.13.0" vllm @@ -70,8 +71,8 @@ def __init__(self, apply_chat_template: dict = {}, **kwargs): - from vllm import SamplingParams from transformers import AutoTokenizer + from vllm import SamplingParams super().__init__(**kwargs) diff --git a/sdp/processors/toloka/accept_if.py b/sdp/processors/toloka/accept_if.py index 8472f601..4f092f51 100644 --- a/sdp/processors/toloka/accept_if.py +++ b/sdp/processors/toloka/accept_if.py @@ -14,6 +14,7 @@ import json import os +import warnings from collections import defaultdict from typing import Optional @@ -73,6 +74,15 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + + # Deprecation warning + warnings.warn( + "Toloka processors are deprecated and will be removed in a future version. 
" + "Please migrate to alternative solutions for crowdsourcing tasks.", + DeprecationWarning, + stacklevel=2 + ) + self.input_data_file = input_data_file self.input_pool_file = input_pool_file self.threshold = threshold diff --git a/sdp/processors/toloka/create_pool.py b/sdp/processors/toloka/create_pool.py index 9948cef4..75daed4e 100644 --- a/sdp/processors/toloka/create_pool.py +++ b/sdp/processors/toloka/create_pool.py @@ -15,6 +15,7 @@ import datetime import json import os +import warnings from sdp.logging import logger from sdp.processors.base_processor import BaseParallelProcessor @@ -55,7 +56,17 @@ def __init__( lang : str, optional The language filter for the pool. Defaults to 'HY'. """ + super().__init__(**kwargs) + + # Deprecation warning + warnings.warn( + "Toloka processors are deprecated and will be removed in a future version. " + "Please migrate to alternative solutions for crowdsourcing tasks.", + DeprecationWarning, + stacklevel=2 + ) + self.API_KEY = os.getenv('TOLOKA_API_KEY') if not self.API_KEY: raise ValueError("TOLOKA_API_KEY environment variable is not set") diff --git a/sdp/processors/toloka/create_project.py b/sdp/processors/toloka/create_project.py index bf8ece19..a9d01102 100644 --- a/sdp/processors/toloka/create_project.py +++ b/sdp/processors/toloka/create_project.py @@ -14,6 +14,7 @@ import json import os +import warnings from sdp.logging import logger from sdp.processors.base_processor import BaseParallelProcessor, DataEntry @@ -52,6 +53,15 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + + # Deprecation warning + warnings.warn( + "Toloka processors are deprecated and will be removed in a future version. 
" + "Please migrate to alternative solutions for crowdsourcing tasks.", + DeprecationWarning, + stacklevel=2 + ) + self.API_KEY = os.getenv('TOLOKA_API_KEY') if not self.API_KEY: raise ValueError("TOLOKA_API_KEY environment variable is not set") diff --git a/sdp/processors/toloka/create_sentence_set.py b/sdp/processors/toloka/create_sentence_set.py index 8a86afb6..8141f92d 100644 --- a/sdp/processors/toloka/create_sentence_set.py +++ b/sdp/processors/toloka/create_sentence_set.py @@ -14,6 +14,7 @@ import json import os +import warnings from docx import Document @@ -34,6 +35,14 @@ class CreateSentenceSet(BaseParallelProcessor): """ def __init__(self, **kwargs): super().__init__(**kwargs) + + # Deprecation warning + warnings.warn( + "Toloka processors are deprecated and will be removed in a future version. " + "Please migrate to alternative solutions for crowdsourcing tasks.", + DeprecationWarning, + stacklevel=2 + ) def parse_docx(self, file_path): doc = Document(file_path) diff --git a/sdp/processors/toloka/create_task_set.py b/sdp/processors/toloka/create_task_set.py index 3957091f..8e58ca5a 100644 --- a/sdp/processors/toloka/create_task_set.py +++ b/sdp/processors/toloka/create_task_set.py @@ -14,6 +14,7 @@ import json import os +import warnings from typing import List, Optional from sdp.logging import logger @@ -53,6 +54,15 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + + # Deprecation warning + warnings.warn( + "Toloka processors are deprecated and will be removed in a future version. 
" + "Please migrate to alternative solutions for crowdsourcing tasks.", + DeprecationWarning, + stacklevel=2 + ) + self.input_data_file = input_data_file self.input_pool_file = input_pool_file self.limit = limit diff --git a/sdp/processors/toloka/download_responses.py b/sdp/processors/toloka/download_responses.py index aa2563cf..2fcfe48d 100644 --- a/sdp/processors/toloka/download_responses.py +++ b/sdp/processors/toloka/download_responses.py @@ -14,6 +14,7 @@ import json import os +import warnings from sdp.logging import logger from sdp.processors.base_processor import BaseParallelProcessor, DataEntry @@ -82,6 +83,15 @@ def __init__( The ID of the pool from which results will be retrieved. Defaults to None. """ super().__init__(**kwargs) + + # Deprecation warning + warnings.warn( + "Toloka processors are deprecated and will be removed in a future version. " + "Please migrate to alternative solutions for crowdsourcing tasks.", + DeprecationWarning, + stacklevel=2 + ) + self.input_data_file = input_data_file self.input_pool_file = input_pool_file self.output_dir = output_dir diff --git a/sdp/processors/toloka/reject_if.py b/sdp/processors/toloka/reject_if.py index 182c3e86..35057942 100644 --- a/sdp/processors/toloka/reject_if.py +++ b/sdp/processors/toloka/reject_if.py @@ -14,6 +14,7 @@ import json import os +import warnings from sdp.logging import logger from sdp.processors.base_processor import BaseParallelProcessor, DataEntry @@ -77,6 +78,15 @@ def __init__( The ID of the pool from which assignments will be retrieved. Defaults to None. """ super().__init__(**kwargs) + + # Deprecation warning + warnings.warn( + "Toloka processors are deprecated and will be removed in a future version. 
" + "Please migrate to alternative solutions for crowdsourcing tasks.", + DeprecationWarning, + stacklevel=2 + ) + self.input_data_file = input_data_file self.input_pool_file = input_pool_file self.config_file = config_file diff --git a/sdp/processors/tts/nemo_asr_align.py b/sdp/processors/tts/nemo_asr_align.py index 9a71c476..19f426db 100644 --- a/sdp/processors/tts/nemo_asr_align.py +++ b/sdp/processors/tts/nemo_asr_align.py @@ -119,7 +119,7 @@ def get_alignments_text(self, hypotheses): - list: List of dictionaries with word alignments (word, start, end) - str: The transcribed text """ - timestamp_dict = hypotheses.timestep # extract timesteps from hypothesis of first (and only) audio file + timestamp_dict = hypotheses.timestamp # extract timesteps from hypothesis of first (and only) audio file # For a FastConformer model, you can display the word timestamps as follows: # 80ms is duration of a timestep at output of the Conformer diff --git a/sdp/processors/tts/text.py b/sdp/processors/tts/text.py index 37dbb862..b259181f 100644 --- a/sdp/processors/tts/text.py +++ b/sdp/processors/tts/text.py @@ -15,7 +15,6 @@ import json from sdp.processors.base_processor import BaseProcessor, BaseParallelProcessor, DataEntry from sdp.utils.common import load_manifest, save_manifest -from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo.collections.nlp.models import PunctuationCapitalizationModel class InverseTextNormalizationProcessor(BaseParallelProcessor): @@ -42,6 +41,7 @@ class InverseTextNormalizationProcessor(BaseParallelProcessor): def __init__(self, language="en", **kwargs): + from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer super().__init__(**kwargs) self.normalizer = InverseNormalizer(lang=language) diff --git a/tests/test_cfg_end_to_end_tests.py b/tests/test_cfg_end_to_end_tests.py index aa0fde74..c7b49fa0 100644 --- a/tests/test_cfg_end_to_end_tests.py +++ 
b/tests/test_cfg_end_to_end_tests.py @@ -288,20 +288,20 @@ def get_test_cases() -> List[Tuple[str, Callable]]: config_path=f"{DATASET_CONFIGS_ROOT}/arabic/everyayah/config.yaml", data_check_fn=partial(data_check_fn_generic, file_name="everyayah.hf") ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", - data_check_fn=data_check_fn_armenian_toloka_pipeline_start, - fields_to_ignore=['source_filepath'], - processors_to_run="2:14", - reference_manifest_filename="pipeline_start/test_data_reference.json" - ), - TestCase( - config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", - data_check_fn=data_check_fn_armenian_toloka_pipeline_get_final_res, - reference_manifest_filename="pipeline_get_final_res/test_data_reference.json", - fields_to_ignore=['audio_filepath', 'duration'], - processors_to_run="1:6" - ), + # TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_start.yaml", + # data_check_fn=data_check_fn_armenian_toloka_pipeline_start, + # fields_to_ignore=['source_filepath'], + # processors_to_run="2:14", + # reference_manifest_filename="pipeline_start/test_data_reference.json" + # ), + # TestCase( + # config_path=f"{DATASET_CONFIGS_ROOT}/armenian/toloka/pipeline_get_final_res.yaml", + # data_check_fn=data_check_fn_armenian_toloka_pipeline_get_final_res, + # reference_manifest_filename="pipeline_get_final_res/test_data_reference.json", + # fields_to_ignore=['audio_filepath', 'duration'], + # processors_to_run="1:6" + # ), TestCase( config_path=f"{DATASET_CONFIGS_ROOT}/portuguese/unlabeled/config.yaml", data_check_fn=partial(data_check_fn_unlabeled), @@ -498,4 +498,4 @@ def test_configs(setup_data, tmp_path): shutil.rmtree(tmp_path) if __name__ == "__main__": - pytest.main([__file__, "-v", "--durations=0"]) + pytest.main([__file__, "-v", "--durations=0"]) \ No newline at end of file