diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..48dadd7
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,67 @@
+name: Tests
+
+on:
+  push:
+    branches: ["main", "master"]
+  pull_request:
+  workflow_dispatch:
+
+# Least privilege: the jobs below only check out and test the repository.
+permissions:
+  contents: read
+
+jobs:
+  # Supported interpreters: one job per matrix entry.
+  test:
+    name: Python ${{ matrix.python-version }}
+    runs-on: ubuntu-latest
+    strategy:
+      # Let every Python version report its own result.
+      fail-fast: false
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          cache-dependency-glob: "pyproject.toml"
+
+      - name: Generate proto stubs and install package
+        run: ./install.sh
+
+      - name: Run tests via tox
+        env:
+          # Handed over through the environment instead of being expanded
+          # into the script text.
+          PYTHON_VERSION: ${{ matrix.python-version }}
+        run: |
+          # "3.10" -> "py310"
+          TOXENV="py${PYTHON_VERSION//./}"
+          uvx --with "tox-uv>=1" tox -e "${TOXENV}"
+
+  # Python 3.14 runs as its own job so that its failure is tolerated.
+  test-py314:
+    name: Python 3.14 (allowed failure)
+    runs-on: ubuntu-latest
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          cache-dependency-glob: "pyproject.toml"
+
+      - name: Generate proto stubs and install package
+        run: ./install.sh
+
+      - name: Run tests via tox
+        run: uvx --with "tox-uv>=1" tox -e py314
diff --git a/.gitignore b/.gitignore
index ccdbf67..6262d55 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,10 @@
 build/
 *.egg-info/
 .eggs/
+.tox/
 google/
 __pycache__/
 techmo/
 .venv/
 .vscode/
+pre-commit/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..20f982f
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "submodules/asr-api"]
+	path = submodules/asr-api
+	url = https://github.com/techmo-pl/asr-api.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3c71927..d75ed34 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,28 @@
 # Changelog of ASR API (Python)
 
-## [1.0.0] - 2024-08-14
+
+## [1.1.4] - 2026-03-22
+
+### Fixed
+
+- `install.sh`: added `export
PATH="$HOME/.local/bin:$PATH"` so that `uv` is found on runners where it is installed locally rather than system-wide. +- `VERSION.py`: corrected version string (was not updated when 1.1.3 was tagged). + +### Changed + +- `setup.py`: replaced `pkg_resources` with importlib-compatible path resolution; removed upper bound on setuptools; removed upper bound on grpcio-tools build requirement. +- `pyproject.toml`: removed upper bound on grpcio and protobuf runtime requirements; added Python-version markers to guard Python 3.8 users from grpcio>=1.71.0 and protobuf>=6.0.0; grpcio bounds set to `>=1.49.4,<1.71.0` for Python 3.8 and `>=1.49.4` for 3.9+; protobuf bounds set to `>=4.21.3,<6`; `requires-python` lowered to `>=3.8`; introduced upper bound on setuptools below 82; added `pip<26` constraint. +- `tox.ini`, `install.sh`: introduced uv-based multi-version testing (Python 3.8–3.14); replaced Docker-based single-version test with tox multi-version matrix. +- `submodules/asr-api`: updated to v1.1.1; restructured from committed proto files to a submodule. +- `asr_api/`: support for _techmo.asr.api.v1p1_ API. +- `tests/`: attribute check for _techmo.asr.api.v1p1_ API. + + +## [1.0.0] - 2024-01-29 ### Added -- _asr_api_ package - - support for _techmo.asr.api.dictation_ API - - support for _techmo.asr.api.v1_ API -- Setuptools configuration +- `asr_api/`: support for _techmo.asr.api.dictation_ and _techmo.asr.api.v1_ APIs. +- `pyproject.toml`: setuptools configuration. +- `tests/`: attribute checks for _techmo.asr.api.dictation_ and _techmo.asr.api.v1_ APIs; coverage report. +- `submodules/asr-api`: asr-api v1.0.0. diff --git a/README.md b/README.md index 2b90047..38d64d9 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,16 @@ # ASR API (Python) -The collection of gRPC APIs for Techmo ASR supplied as a Python package. +The collection of gRPC APIs for Techmo ASR solutions supplied as a Python package. 
## Setup
 
-The project can be used as-is and does not require any additional setup.
+Run once after cloning to initialise the submodule:
 
-## Requirements
+```sh
+./setup.sh
+```
+
+### Requirements
 
 - [Python](https://www.python.org/) >=3.8
 
@@ -23,6 +27,23 @@ pip install --require-virtualenv --upgrade pip
 pip install --require-virtualenv .
 ```
 
+*For basic development use, consider convenient `./install.sh`.*
+
+## Running tests
+
+Proto stubs must be generated before running tests. Use `./install.sh` once, then invoke tox:
+
+```sh
+./install.sh
+uvx --with "tox-uv>=1" tox
+```
+
+To run a single Python version:
+
+```sh
+uvx --with "tox-uv>=1" tox -e py312
+```
+
 ## Usage
 
 ### Import
@@ -31,6 +52,14 @@ The package provides a precompiled collection of `.proto` files that can be impo
 
 Example:
 
+- direct import
+
+```python
+>>> from techmo.asr.api.v1p1 import asr_pb2 as api
+>>> hasattr(api, "StreamingRecognizeRequest")
+True
+```
+
 - import from an alias module
 
 ```python
diff --git a/VERSION.md b/VERSION.md
deleted file mode 100644
index 3eefcb9..0000000
--- a/VERSION.md
+++ /dev/null
@@ -1 +0,0 @@
-1.0.0
diff --git a/asr_api/VERSION.py b/asr_api/VERSION.py
index 5becc17..c72e379 100644
--- a/asr_api/VERSION.py
+++ b/asr_api/VERSION.py
@@ -1 +1 @@
-__version__ = "1.0.0"
+__version__ = "1.1.4"
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..490190a
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+#
+# usage: ./install.sh [VENV_PATH]
+#
+# VENV_PATH: Optional path for the virtual environment (default: ./.venv).
+#
+# Creates a virtualenv with uv and installs the package with test dependencies.
+
+set -euo pipefail
+
+# Make a user-local uv (installed under ~/.local/bin rather than system-wide)
+# visible to this script.
+export PATH="$HOME/.local/bin:$PATH"
+
+if ! command -v uv >/dev/null 2>&1; then
+    echo "error: uv not found on PATH; see https://docs.astral.sh/uv/" >&2
+    exit 1
+fi
+
+VENV_PATH="${1:-.venv}"
+
+# Reuse an existing environment directory; create it only when it is missing.
+if [ ! -d "${VENV_PATH}" ]; then
+    uv venv "${VENV_PATH}"
+fi
+
+# shellcheck disable=SC1091
+source "${VENV_PATH}/bin/activate"
+uv pip install -e ".[tests]"
diff --git a/proto/google/rpc/status.proto b/proto/google/rpc/status.proto
deleted file mode 100644
index 923e169..0000000
--- a/proto/google/rpc/status.proto
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2022 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-syntax = "proto3";
-
-package google.rpc;
-
-import "google/protobuf/any.proto";
-
-option cc_enable_arenas = true;
-option go_package = "google.golang.org/genproto/googleapis/rpc/status;status";
-option java_multiple_files = true;
-option java_outer_classname = "StatusProto";
-option java_package = "com.google.rpc";
-option objc_class_prefix = "RPC";
-
-// The `Status` type defines a logical error model that is suitable for
-// different programming environments, including REST APIs and RPC APIs. It is
-// used by [gRPC](https://github.com/grpc). Each `Status` message contains
-// three pieces of data: error code, error message, and error details.
-//
-// You can find out more about this error model and how to work with it in the
-// [API Design Guide](https://cloud.google.com/apis/design/errors).
-message Status {
-  // The status code, which should be an enum value of
-  // [google.rpc.Code][google.rpc.Code].
-  int32 code = 1;
-
-  // A developer-facing error message, which should be in English.
Any - // user-facing error message should be localized and sent in the - // [google.rpc.Status.details][google.rpc.Status.details] field, or localized - // by the client. - string message = 2; - - // A list of messages that carry the error details. There is a common set of - // message types for APIs to use. - repeated google.protobuf.Any details = 3; -} diff --git a/proto/techmo/api/status.proto b/proto/techmo/api/status.proto deleted file mode 100644 index 71a21ca..0000000 --- a/proto/techmo/api/status.proto +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright 2023 Techmo sp. z o.o. - -syntax = "proto3"; - -package techmo.api; - -import "google/protobuf/any.proto"; - - -// -message Status { - // - int32 code = 1; - - // - string message = 2; - - // - repeated google.protobuf.Any details = 3; -} diff --git a/proto/techmo/asr/api/dictation/asr.proto b/proto/techmo/asr/api/dictation/asr.proto deleted file mode 100644 index 2380707..0000000 --- a/proto/techmo/asr/api/dictation/asr.proto +++ /dev/null @@ -1,782 +0,0 @@ -// Copyright 2018 Google LLC. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Modified by Techmo, copyright by Google. Changes include: -// 1. Additions that introduce new features to the original API. Extensions -// (parts that were added to the original document) by Techmo are marked with -// [**Extension by Techmo**] tag. -// - `MP3` audio encoding type. -// - `ConfigField` as means to provide additional configuration. 
-// - `ResultFinalizationCause` as means to indicate MRCPv2-related recognition -// result finalization cause. -// - `RecognitionLattice` and `LatticeEdge` as means to return detailed -// recognition results. -// - `Age` and `Gender` as means to provide age and gender recognition results -// in `SpeechRecognitionResult` and `StreamingRecognitionResult`. -// 2. Modifications of comments, according to how recognition is performed by Techmo. -// - [*Unused*] tags for fields or values that are not used (ignored when -// provided in request, never returned in response). -// - [*Unsupported*] tags for fields or values that will result in an error -// when provided in request. -// 3. Removal of `LongRunningRecognize` support (commented out). - -syntax = "proto3"; - -package google.cloud.speech.v1; - -// import "google/api/annotations.proto"; -// import "google/longrunning/operations.proto"; -// import "google/protobuf/any.proto"; -import "google/protobuf/duration.proto"; -// import "google/protobuf/empty.proto"; -// import "google/protobuf/timestamp.proto"; -import "google/rpc/status.proto"; - -option cc_enable_arenas = true; -option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1;speech"; -option java_multiple_files = true; -option java_outer_classname = "SpeechProto"; -option java_package = "com.google.cloud.speech.v1"; - - -// Service that implements Google Cloud Speech API extended by Techmo. -service Speech { - // Performs synchronous speech recognition: receive results after all audio - // has been sent and processed. - rpc Recognize(RecognizeRequest) returns (RecognizeResponse) { - // option (google.api.http) = { - // post: "/v1/speech:recognize" - // body: "*" - // }; - } - - // // Performs asynchronous speech recognition: receive results via the - // // google.longrunning.Operations interface. Returns either an - // // `Operation.error` or an `Operation.response` which contains - // // a `LongRunningRecognizeResponse` message. 
- // rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) { - // option (google.api.http) = { - // post: "/v1/speech:longrunningrecognize" - // body: "*" - // }; - // } - - // Performs bidirectional streaming speech recognition: receive results while - // sending audio. This method is only available via the gRPC API (not REST). - rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) { - } -} - -// The top-level message sent by the client for the `Recognize` method. -message RecognizeRequest { - // [*Required*] Provides information to the recognizer that specifies how to - // process the request. - RecognitionConfig config = 1; - - // [*Required*] The audio data to be recognized. - RecognitionAudio audio = 2; -} - -// // The top-level message sent by the client for the `LongRunningRecognize` -// // method. -// message LongRunningRecognizeRequest { -// // [*Required*] Provides information to the recognizer that specifies how to -// // process the request. -// RecognitionConfig config = 1; - -// // [*Required*] The audio data to be recognized. -// RecognitionAudio audio = 2; -// } - -// The top-level message sent by the client for the `StreamingRecognize` method. -// Multiple `StreamingRecognizeRequest` messages are sent. The first message -// must contain a `streaming_config` message and must not contain `audio` data. -// All subsequent messages must contain `audio` data and must not contain a -// `streaming_config` message. -message StreamingRecognizeRequest { - // The streaming request, which is either a streaming config or audio content. - oneof streaming_request { - // Provides information to the recognizer that specifies how to process the - // request. The first `StreamingRecognizeRequest` message must contain a - // `streaming_config` message. - StreamingRecognitionConfig streaming_config = 1; - - // The audio data to be recognized. 
Sequential chunks of audio data are sent - // in sequential `StreamingRecognizeRequest` messages. The first - // `StreamingRecognizeRequest` message must not contain `audio_content` data - // and all subsequent `StreamingRecognizeRequest` messages must contain - // `audio_content` data. The audio bytes must be encoded as specified in - // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a - // pure binary representation (not base64). - bytes audio_content = 2; - - // @exclude See [content limits](/speech-to-text/quotas#content). - - // [**Extension by Techmo**] - // Another experimental feature from MRCPv2. - // https://www.rfc-editor.org/rfc/rfc6787.html#section-9.13 - bool start_input_timers = 3; - } -} - -// Provides information to the recognizer that specifies how to process the -// request. -message StreamingRecognitionConfig { - // [*Required*] Provides information to the recognizer that specifies how to - // process the request. - RecognitionConfig config = 1; - - // [*Optional*] If `false` or omitted, the recognizer will perform continuous - // recognition (continuing to wait for and process audio even if the user - // pauses speaking) until the client closes the input stream (gRPC API) or - // until the maximum time limit has been reached. May return multiple - // `StreamingRecognitionResult`s with the `is_final` flag set to `true`. - // If `true`, the recognizer will detect a single spoken utterance. When it - // detects that the user has paused or stopped speaking, it will return an - // `END_OF_SINGLE_UTTERANCE` event and cease recognition. It will return no - // more than one `StreamingRecognitionResult` with the `is_final` flag set to - // `true`. - bool single_utterance = 2; - - // [*Optional*] If `true`, interim results (tentative hypotheses) may be - // returned as they become available (these interim results are indicated with - // the `is_final=false` flag). 
- // If `false` or omitted, only `is_final=true` result(s) are returned. - bool interim_results = 3; - - // [**Extension by Techmo**] - // Another experimental feature from MRCPv2. - // https://www.rfc-editor.org/rfc/rfc6787.html#section-9.4.14 - optional bool start_input_timers = 4; -} - -// Provides information to the recognizer that specifies how to process the -// request. -message RecognitionConfig { - - // @exclude The encoding of the audio data sent in the request. - // - // All encodings support only 1 channel (mono) audio. - // - // For best results, the audio source should be captured and transmitted using - // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech - // recognition can be reduced if lossy codecs are used to capture or transmit - // audio, particularly if background noise is present. Lossy codecs include - // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `SPEEX_WITH_HEADER_BYTE`. - // - // The `FLAC` and `WAV` audio file formats include a header that describes the - // included audio content. You can request recognition for `WAV` files that - // contain either `LINEAR16` or `MULAW` encoded audio. - // If you send `FLAC` or `WAV` audio file format in - // your request, you do not need to specify an `AudioEncoding`; the audio - // encoding format is determined from the file header. If you specify - // an `AudioEncoding` when you send send `FLAC` or `WAV` audio, the - // encoding configuration must match the encoding described in the audio - // header; otherwise the request returns an - // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code. - - // The encoding of the audio data sent in the request. - // All encodings support only 1 channel (mono) audio. - enum AudioEncoding { - // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. - ENCODING_UNSPECIFIED = 0; - - // Uncompressed 16-bit signed little-endian samples (Linear PCM). 
- LINEAR16 = 1; - - // `FLAC` (Free Lossless Audio - // Codec) is the recommended encoding because it is - // lossless--therefore recognition is not compromised--and - // requires only about half the bandwidth of `LINEAR16`. `FLAC` stream - // encoding supports 16-bit and 24-bit samples, however, not all fields in - // `STREAMINFO` are supported. - // [**Extension by Techmo**] Supported only by `Recognize`. When requested by `StreamingRecognize`, will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. Silently ignores `sample_rate_hertz` and detects real sample rate from file header instead. - FLAC = 2; - - // @exclude 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law. - - // [*Unsupported*] - MULAW = 3; - - // @exclude Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000. - - // [*Unsupported*] - AMR = 4; - - // @exclude Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000. - - // [*Unsupported*] - AMR_WB = 5; - - // Opus encoded audio frames in Ogg container - // ([OggOpus](https://wiki.xiph.org/OggOpus)). - // [**Extension by Techmo**] Silently ignores `sample_rate_hertz` and detects real sample rate from file header instead. - OGG_OPUS = 6; - - // @exclude `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000. - - // @exclude Although the use of lossy encodings is not recommended, if a very low - // bitrate encoding is required, `OGG_OPUS` is highly preferred over - // Speex encoding. The [Speex](https://speex.org/) encoding supported by - // Cloud Speech API has a header byte in each block, as in MIME type - // `audio/x-speex-with-header-byte`. - // It is a variant of the RTP Speex encoding defined in - // [RFC 5574](https://tools.ietf.org/html/rfc5574). - // The stream is a sequence of blocks, one block per RTP packet. 
Each block - // starts with a byte containing the length of the block, in bytes, followed - // by one or more frames of Speex data, padded to an integral number of - // bytes (octets) as specified in RFC 5574. In other words, each RTP header - // is replaced with a single byte containing the block length. Only Speex - // wideband is supported. `sample_rate_hertz` must be 16000. - - // [*Unsupported*] - SPEEX_WITH_HEADER_BYTE = 7; - - // [**Extension by Techmo**] `MP3` (standards ISO/IEC 11172-3 and ISO/IEC 13818-3) Only constant bit rate files are accepted. Silently ignores `sample_rate_hertz` and detects real sample rate from file header instead. - MP3 = 8; - } - - // [*Required*] Encoding of audio data sent in all `RecognitionAudio` messages. - AudioEncoding encoding = 1; - - // @exclude Encoding of audio data sent in all `RecognitionAudio` messages. - // This field is optional for `FLAC` and `WAV` audio files and required - // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding]. - - // [*Required*] Sample rate in Hertz of the audio data sent in all - // `RecognitionAudio` messages. Valid values are: 8000-48000. - // 16000 is optimal. For best results, set the sampling rate of the audio - // source to 16000 Hz. If that's not possible, use the native sample rate of - // the audio source (instead of re-sampling). - // [**Extension by Techmo**] Silently ignored for `FLAC`, `OGG_OPUS` and `MP3` encodings. Real sample rate will be detected from file header instead. - int32 sample_rate_hertz = 2; - - // @exclude This field is optional for `FLAC` and `WAV` audio files and required - // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding]. - - // [*Required*] The language of the supplied audio as a - // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag. - // Example: "en-US". 
- // The only language supported at the moment is Polish (`pl-PL`). - string language_code = 3; - - // @exclude See [Language Support](/speech-to-text/docs/languages) - // for a list of the currently supported language codes. - - // [*Optional*] Maximum number of recognition hypotheses to be returned. - // Specifically, the maximum number of `SpeechRecognitionAlternative` messages - // within each `SpeechRecognitionResult`. - // The server may return fewer than `max_alternatives`. - // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of - // one. If omitted, will return a maximum of one. - int32 max_alternatives = 4; - - // @exclude [*Optional*] If set to `true`, the server will attempt to filter out - // profanities, replacing all but the initial character in each filtered word - // with asterisks, e.g. "f***". If set to `false` or omitted, profanities - // won't be filtered out. - - // [*Optional*][*Unused*] - bool profanity_filter = 5; - - // @exclude [*Optional*] array of [SpeechContext][google.cloud.speech.v1.SpeechContext]. - // First element of the array is used to identify context model to be used - // in current recognition. - - // [*Optional*] - repeated SpeechContext speech_contexts = 6; - - // [*Optional*] If `true`, the top result includes a list of words and - // the start and end time offsets (timestamps) for those words. If - // `false`, no word-level time offset information is returned. The default is - // `false`. - bool enable_word_time_offsets = 8; - - // @exclude [*Optional*] If 'true', adds punctuation to recognition result hypotheses. - // This feature is only available in select languages. Setting this for - // requests in other languages has no effect at all. - // The default 'false' value does not add punctuation to result hypotheses. - // Note: This is currently offered as an experimental service, complimentary - // to all users. In the future this may be exclusively available as a - // premium feature. 
- - // [*Optional*][*Unused*] - bool enable_automatic_punctuation = 11; - - // [**Extension by Techmo**] - // [*Optional*] A means to provide additional configuration fields via request. - repeated ConfigField config_fields = 12; - - // @exclude [*Optional*] Which model to select for the given request. Select the model - // best suited to your domain to get best results. If a model is not - // explicitly specified, then we auto-select a model based on the parameters - // in the RecognitionConfig. - // - // - // - // - // - // - // - // - // - // - // - // - // - // - // - // - // - // - // - // - // - //
ModelDescription
command_and_searchBest for short queries such as voice commands or voice search.
phone_callBest for audio that originated from a phone call (typically - // recorded at an 8khz sampling rate).
videoBest for audio that originated from from video or includes multiple - // speakers. Ideally the audio is recorded at a 16khz or greater - // sampling rate. This is a premium model that costs more than the - // standard rate.
defaultBest for audio that is not one of the specific audio models. - // For example, long-form audio. Ideally the audio is high-fidelity, - // recorded at a 16khz or greater sampling rate.
- - // [*Optional*][*Unused*] - string model = 13; - - // @exclude [*Optional*] Set to true to use an enhanced model for speech recognition. - // You must also set the `model` field to a valid, enhanced model. If - // `use_enhanced` is set to true and the `model` field is not set, then - // `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced - // version of the specified model does not exist, then the speech is - // recognized using the standard version of the specified model. - // - // Enhanced speech models require that you opt-in to data logging using - // instructions in the [documentation](/speech-to-text/enable-data-logging). - // If you set `use_enhanced` to true and you have not enabled audio logging, - // then you will receive an error. - - // [*Optional*][*Unused*] - bool use_enhanced = 14; - - // [**Extension by Techmo**][*Optional*] Gender and age recognition parameters - SpeechDurationConfig speech_duration_gender_recognition = 15; - - // [**Extension by Techmo**][*Optional*] Gender and age recognition parameters - SpeechDurationConfig speech_duration_age_recognition = 16; -} - -// [**Extension by Techmo**] Gender and age recognition parameters. -message SpeechDurationConfig { - // The way in which service decides when to start recognition. - SpeechDurationThresholdMode speech_duration_threshold_mode = 3; - - // The minimum duration of speech in `audio` required to start recognition, in ms. - // Ignored, unless `speech_duration_threshold_mode` is `CUSTOM`. - uint32 speech_duration_threshold_ms = 4; -} - -// [**Extension by Techmo**] -// The possible ways for a service to decide when to start recognition -// depending on a duration of speech in `audio`. -enum SpeechDurationThresholdMode { - // Use an implementation-defined threshold value carefully tuned to obtain best results. - DEFAULT = 0; - - // Use an user-defined threshold value provided in the configuration message. 
- CUSTOM = 1; - - // Disable early start of recognition and wait for the entire audio data. - DISABLED = 2; -} - - -// @exclude Provides "hints" to the speech recognizer to favor specific words and phrases -// in the results. - -message SpeechContext { - // @exclude [*Optional*] Can be used to send a context phrase that switches the model - // used during recognition. If the phrase correctly identifies the context model - // used in service, it will be used instead of the general model for the current recognition. - // Due to compatibility with Google API, the object is defined as a list of strings, - // but only the first element of the list is used as the context phrase, - // the rest are ignored if present. - - repeated string phrases = 1; -} - -// [**Extension by Techmo**] -// Provides a pair of configuration field name and value. -message ConfigField { - // Name of configuration field. - string key = 1; - - // Value of configuration field. - string value = 2; -} - -// @exclude Contains audio data in the encoding specified in the `RecognitionConfig`. -// Either `content` or `uri` must be supplied. Supplying both or neither -// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See -// [audio limits](https://cloud.google.com/speech/limits#content). - -// Contains audio data in the encoding specified in the `RecognitionConfig`. -// Only `content` is allowed to be supplied. -message RecognitionAudio { - - // @exclude The audio source, which is either inline content or a Google Cloud - // Storage uri. - - // The audio source, which is inline content. - oneof audio_source { - // The audio data bytes encoded as specified in - // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a - // pure binary representation, whereas JSON representations use base64. - bytes content = 1; - - // @exclude URI that points to a file that contains audio data bytes as specified in - // `RecognitionConfig`. 
The file must not be compressed (for example, gzip). - // Currently, only Google Cloud Storage URIs are - // supported, which must be specified in the following format: - // `gs://bucket_name/object_name` (other URI formats return - // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see - // [Request URIs](https://cloud.google.com/storage/docs/reference-uris). - - // [*Unsupported*] - string uri = 2; - } -} - -// The only message returned to the client by the `Recognize` method. It -// contains the result as zero or more sequential `SpeechRecognitionResult` -// messages. -message RecognizeResponse { - // [*Output only*] Sequential list of transcription results corresponding to - // sequential portions of audio. - repeated SpeechRecognitionResult results = 2; -} - -// // The only message returned to the client by the `LongRunningRecognize` method. -// // It contains the result as zero or more sequential `SpeechRecognitionResult` -// // messages. It is included in the `result.response` field of the `Operation` -// // returned by the `GetOperation` call of the `google::longrunning::Operations` -// // service. -// message LongRunningRecognizeResponse { -// // [*Output only*] Sequential list of transcription results corresponding to -// // sequential portions of audio. -// repeated SpeechRecognitionResult results = 2; -// } - -// // Describes the progress of a long-running `LongRunningRecognize` call. It is -// // included in the `metadata` field of the `Operation` returned by the -// // `GetOperation` call of the `google::longrunning::Operations` service. -// message LongRunningRecognizeMetadata { -// // Approximate percentage of audio processed thus far. Guaranteed to be 100 -// // when the audio is fully processed and the results are available. -// int32 progress_percent = 1; - -// // Time when the request was received. -// google.protobuf.Timestamp start_time = 2; - -// // Time of the most recent processing update. 
-// google.protobuf.Timestamp last_update_time = 3; -// } - -// `StreamingRecognizeResponse` is the only message returned to the client by -// `StreamingRecognize`. A series of one or more `StreamingRecognizeResponse` -// messages are streamed back to the client. -// -// Here's an example of a series of ten `StreamingRecognizeResponse`s that might -// be returned while processing audio: -// -// 1. results { alternatives { transcript: "tube" } stability: 0.01 } -// -// 2. results { alternatives { transcript: "to be a" } stability: 0.01 } -// -// 3. results { alternatives { transcript: "to be" } stability: 0.9 } -// results { alternatives { transcript: " or not to be" } stability: 0.01 } -// -// 4. results { alternatives { transcript: "to be or not to be" -// confidence: 0.92 } -// alternatives { transcript: "to bee or not to bee" } -// is_final: true } -// -// 5. results { alternatives { transcript: " that's" } stability: 0.01 } -// -// 6. results { alternatives { transcript: " that is" } stability: 0.9 } -// results { alternatives { transcript: " the question" } stability: 0.01 } -// -// 7. results { alternatives { transcript: " that is the question" -// confidence: 0.98 } -// alternatives { transcript: " that was the question" } -// is_final: true } -// -// Notes: -// -// - Only two of the above responses #4 and #7 contain final results; they are -// indicated by `is_final: true`. Concatenating these together generates the -// full transcript: "to be or not to be that is the question". -// -// - The others contain interim `results`. #3 and #6 contain two interim -// `results`: the first portion has a high stability and is less likely to -// change; the second portion has a low stability and is very likely to -// change. A UI designer might choose to show only high stability `results`. -// -// - The specific `stability` and `confidence` values shown above are only for -// illustrative purposes. Actual values may vary. 
-// -// - In each response, only one of these fields will be set: -// `error`, -// `speech_event_type`, or -// one or more (repeated) `results`. -message StreamingRecognizeResponse { - // Indicates the type of speech event. - enum SpeechEventType { - // No speech event specified. - SPEECH_EVENT_UNSPECIFIED = 0; - - // This event indicates that the server has detected the end of the user's - // speech utterance and expects no additional speech. Therefore, the server - // will not process additional audio (although it may subsequently return - // additional results). The client should stop sending additional audio - // data, half-close the gRPC connection, and wait for any additional results - // until the server closes the gRPC connection. This event is only sent if - // `single_utterance` was set to `true`, and is not used otherwise. - END_OF_SINGLE_UTTERANCE = 1; - } - - // [*Output only*] If set, returns a [google.rpc.Status][google.rpc.Status] message that - // specifies the error for the operation. - google.rpc.Status error = 1; - - // [*Output only*] This repeated list contains zero or more results that - // correspond to consecutive portions of the audio currently being processed. - // It contains zero or one `is_final=true` result (the newly settled portion), - // followed by zero or more `is_final=false` results (the interim results). - repeated StreamingRecognitionResult results = 2; - - // [*Output only*] Indicates the type of speech event. - SpeechEventType speech_event_type = 4; -} - -// A streaming speech recognition result corresponding to a portion of the audio -// that is currently being processed. -message StreamingRecognitionResult { - // [**Extension by Techmo**] - // Indicates the cause of recognition result finalization. These are MRCPv2-related. - // See [Completion-Cause](https://tools.ietf.org/html/rfc6787#section-9.4.11). - enum ResultFinalizationCause { - // No recognition result finalization cause specified. 
- RESULT_FINALIZATION_CAUSE_UNSPECIFIED = 0; - - // Recognition has been finalized with a complete result - // after specified length of silence after user speech. - // See [Speech-Complete-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.15). - SUCCESS = 1; - - // Recognition has started and there was no speech detected - // for a certain period of time. - // See [No-Input-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.6). - NO_INPUT_TIMEOUT = 2; - - // Recognition has been finalized because speech was too long, with a complete result. - // See [Recognition-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.7). - SUCCESS_MAXTIME = 3; - - // Recognition has been finalized with an incomplete result - // after specified length of silence after user speech. - // See [Speech-Incomplete-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.16). - PARTIAL_MATCH = 4; - - // Recognition has been finalized because speech was too long, with no result. - // See [Recognition-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.7). - NO_MATCH_MAXTIME = 5; - } - - // [*Output only*] May contain one or more recognition hypotheses (up to the - // maximum specified in `max_alternatives`). - // These alternatives are ordered in terms of accuracy, with the top (first) - // alternative being the most probable, as ranked by the recognizer. - repeated SpeechRecognitionAlternative alternatives = 1; - - // [*Output only*] If `false`, this `StreamingRecognitionResult` represents an - // interim result that may change. If `true`, this is the final time the - // speech service will return this particular `StreamingRecognitionResult`, - // the recognizer will not return any further hypotheses for this portion of - // the transcript and corresponding audio. - bool is_final = 2; - - // @exclude [*Output only*] An estimate of the likelihood that the recognizer will not - // change its guess about this interim result. 
Values range from 0.0 - // (completely unstable) to 1.0 (completely stable). - // This field is only provided for interim results (`is_final=false`). - // The default of 0.0 is a sentinel value indicating `stability` was not set. - - // [*Unused*] - float stability = 3; - - // [**Extension by Techmo**] - // [*Output only*] Indicates the cause of recognition result finalization. - ResultFinalizationCause result_finalization_cause = 4; - - // [**Extension by Techmo**] - // [*Output only*] Detailed recognition result (lattice). - // Returned only when requested (`ConfigField`: return-lattice=true in - // `RecognitionConfig` Message), only for final (`is_final = true`) results, - // and only when it's allowed by licence. - // When requested and not allowed by licence, [google.rpc.Code.FAILED_PRECONDITION] - // will be returned. - repeated RecognitionLattice lattice = 5; - - // [**Extension by Techmo**] - // [*Output only*] Predicted gender of the speaker - Gender gender = 6; - - // [**Extension by Techmo**] - // [*Output only*] Predicted age of the speaker - Age age = 7; -} - -// A speech recognition result corresponding to a portion of the audio. -message SpeechRecognitionResult { - // [*Output only*] May contain one or more recognition hypotheses (up to the - // maximum specified in `max_alternatives`). - // These alternatives are ordered in terms of accuracy, with the top (first) - // alternative being the most probable, as ranked by the recognizer. - repeated SpeechRecognitionAlternative alternatives = 1; - - // [**Extension by Techmo**] - // [*Output only*] Detailed recognition result (lattice). - // Returned only when requested (`ConfigField`: return-lattice=true in - // `RecognitionConfig` Message), only for final (`is_final = true`) results, - // and only when it's allowed by licence. - // When requested and not allowed by licence, [google.rpc.Code.FAILED_PRECONDITION] - // will be returned. 
- repeated RecognitionLattice lattice = 5; - - // [**Extension by Techmo**] - // [*Output only*] Predicted gender of the speaker - Gender gender = 6; - - // [**Extension by Techmo**] - // [*Output only*] Predicted age of the speaker - Age age = 7; -} - -// Alternative hypotheses (a.k.a. n-best list). -message SpeechRecognitionAlternative { - // [*Output only*] Transcript text representing the words that the user spoke. - string transcript = 1; - - // [*Output only*] The confidence estimate between 0.0 and 1.0. A higher number - // indicates an estimated greater likelihood that the recognized words are - // correct. - float confidence = 2; - - // @exclude This field is set only for the top alternative of a non-streaming - // result or, of a streaming result where `is_final=true`. - // This field is not guaranteed to be accurate and users should not rely on it - // to be always provided. - // The default of 0.0 is a sentinel value indicating `confidence` was not set. - - // [*Output only*] A list of word-specific information for each recognized word. - repeated WordInfo words = 3; -} - -// Word-specific information for recognized words. Word information is only -// included in the response when certain request parameters are set, such -// as `enable_word_time_offsets`. -message WordInfo { - // @exclude [*Output only*] Time offset relative to the beginning of the audio, - // and corresponding to the start of the spoken word. - // This field is only set if `enable_word_time_offsets=true` and only - // in the top hypothesis. - // This is an experimental feature and the accuracy of the time offset can - // vary. - - // [*Output only*] Time offset relative to the beginning of the audio, - // and corresponding to the start of the spoken word. - // This field is only set if `enable_word_time_offsets=true`. 
- google.protobuf.Duration start_time = 1; - - // @exclude [*Output only*] Time offset relative to the beginning of the audio, - // and corresponding to the end of the spoken word. - // This field is only set if `enable_word_time_offsets=true` and only - // in the top hypothesis. - // This is an experimental feature and the accuracy of the time offset can - // vary. - - // [*Output only*] Time offset relative to the beginning of the audio, - // and corresponding to the end of the spoken word. - // This field is only set if `enable_word_time_offsets=true`. - google.protobuf.Duration end_time = 2; - - // [*Output only*] The word corresponding to this set of information. - string word = 3; -} - -// [**Extension by Techmo**] -// Detailed recognition result (lattice). -// Returned only when requested (`ConfigField`: return-lattice=true in -// `RecognitionConfig` Message), only for final (`is_final = true`) results, -// and only when it's allowed by licence. When requested and not allowed by -// licence, [google.rpc.Code.FAILED_PRECONDITION] will be returned. -message RecognitionLattice { - // List of final nodes. - repeated int32 final_nodes = 1; - - // List of lattice edges. - repeated LatticeEdge edges = 2; -} - -// [**Extension by Techmo**] -// Edge-specific information for recognition lattice. -message LatticeEdge { - // Input node ID, node '0' is starting node for the lattice. - int32 start_node = 1; - - // End node ID. - int32 end_node = 2; - - // Word. - string symbol = 3; - - // Language model cost. - float language_cost = 4; - - // Raw acoustic score (unscaled). - float acoustic_cost = 5; - - // Word duration in milliseconds. - int32 duration = 6; -} - -// [**Extension by Techmo**] -// Predicted gender of the speaker -message Gender { - // The recognized gender label. - string gender = 1; - - // The confidence in [0, 1] range, where near 0 means 'unsure' and near 1 means 'almost certain'. 
- float confidence = 2; -} - -// [**Extension by Techmo**] -// Predicted age of the speaker -message Age { - // The recognized age, in years. - int32 age = 1; - - // The confidence in [0, 1] range, where near 0 means 'unsure' and near 1 means 'almost certain'. - float confidence = 2; -} diff --git a/proto/techmo/asr/api/v1/asr.proto b/proto/techmo/asr/api/v1/asr.proto deleted file mode 100644 index a83f414..0000000 --- a/proto/techmo/asr/api/v1/asr.proto +++ /dev/null @@ -1,436 +0,0 @@ -// Copyright 2023 Techmo sp. z o.o. - -syntax = "proto3"; - -package techmo.asr.api.v1; - -import "google/protobuf/duration.proto"; -import "techmo/api/status.proto"; - - -// An automatic speech recognition (ASR) service providing a solution for -// speech-to-text conversion extended by the assessment of additional speech -// and speaker features. -service Asr { - // Perform bidirectional streaming recognition. - rpc StreamingRecognize(stream StreamingRecognizeRequest) - returns (stream StreamingRecognizeResponse) {} -} - -// A message streamed from the client through -// the [`StreamingRecognize`](#StreamingRecognize) method. -message StreamingRecognizeRequest { - oneof request_content { - // The immutable initial configuration of the request. - // Must be sent once in the request's first message. - StreamingRecognizeRequestConfig config = 1; - - // The message controlling the processing flow of the request. - // May be sent multiple times except in the request's first message. - StreamingRecognizeRequestControlMessage control_message = 2; - - // The data contents of the request itself. - // May be sent multiple times except in the request's first message. - StreamingRecognizeRequestData data = 3; - } -} - -// A message holding configuration of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message StreamingRecognizeRequestConfig { - // Part of the configuration for the request's audio content. 
- AudioConfig audio_config = 1; - - // Part of the configuration for the request's result form. - ResultConfig result_config = 2; - - // Part of the configuration for the request's processing flow. - StreamingConfig streaming_config = 3; - - // Part of the configuration for speech recognition. - SpeechRecognitionConfig speech_recognition_config = 4; - - // Part of the configuration for age recognition. - AgeRecognitionConfig age_recognition_config = 5; - - // Part of the configuration for gender recognition. - GenderRecognitionConfig gender_recognition_config = 6; - - // Part of the configuration for language recognition. - LanguageRecognitionConfig language_recognition_config = 7; -} - -// Result configuration of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message ResultConfig { - // The switch that toggles continuous recognition into single utterance mode. - // The service returns a final result for each end of utterance it detects in - // the audio, which may occur multiple times during a request. - // If enabled, the request terminates right after its first final result. - bool enable_single_utterance = 1; - - // The switch that allows interim results. - // If enabled, results containing tentative hypotheses may be returned in - // addition to final ones. - // The service should silently ignore this field if it is unsupported. - bool enable_interim_results = 2; -} - -// Streaming configuration of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message StreamingConfig { - reserved 1; // bool enable_single_utterance = 1; - - // The switch that enables manual control of the input timer. - // The timer imposes two constraints: one that finalizes recognition after - // a specified period unless speech is detected, and the other that limits - // the total time for an utterance. Manual control allows recognition to - // begin but delays enforcement of these constraints. 
The timer restarts - // after each detected end of utterance (each final result). - // If enabled, the timer does not start automatically. Instead, it can be - // initiated by sending - // a [`StreamingRecognizeRequestControlMessage`](#StreamingRecognizeRequestControlMessage) - // with the `start_input_timer` field set to `true` as needed. This should - // occur after the beginning of the request and be repeated after each final - // result. - bool enable_manual_input_timer = 2; -} - -// Audio configuration of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message AudioConfig { - // The possible audio encodings. - enum AudioEncoding { - // Unspecified audio encoding. - UNSPECIFIED = 0; - - // Linear pulse-code modulation of uncompressed 16-bit signed little-endian - // samples. - LINEAR16 = 1; - - // Free Lossless Audio Codec ([FLAC](https://wiki.xiph.org/FLAC)). - // The encoding requires only about half the bandwidth of `LINEAR16`. - // 16-bit and 24-bit samples. Not all fields in `STREAMINFO` are supported. - // When set, the service ignores the `sampling_rate_hz` field and detects - // the actual value from audio header instead. - FLAC = 2; - - // Ogg Encapsulated Opus Audio Codec ([OggOpus](https://wiki.xiph.org/OggOpus)). - // When set, the service ignores the `sampling_rate_hz` field and detects - // the actual value from audio header instead. - OGG_OPUS = 6; - - // MP3 (ISO/IEC 11172-3 and ISO/IEC 13818-3). - // Only constant bitrate. - // When set, the service ignores the `sampling_rate_hz` field and detects - // the actual value from audio header instead. - MP3 = 8; - } - - // The encoding of the audio data sent in the request. Single channel (mono) - // audio is assumed. - // The service should respond with the `INVALID_ARGUMENT` gRPC status code - // if the encoding is `UNSPECIFIED`. - // The service should respond with the `FAILED_PRECONDITION` gRPC status code - // if the encoding is not supported. 
- AudioEncoding encoding = 1; - - // The sampling rate of the audio data sent in the request. - // The service should silently ignore the field for encodings that are sent - // along wtih headers, and detect the value from them instead. - // The service should respond with the `INVALID_ARGUMENT` gRPC status code - // if the value is not greater than 0. - float sampling_rate_hz = 2; -} - -// Configuration of age recognition. -message AgeRecognitionConfig { - // The switch that enables age recognition for the request. - // If disabled or unspecified, the related results are excluded. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. - bool enable_age_recognition = 1; -} - -// Configuration of gender recognition. -message GenderRecognitionConfig { - // The switch that enables gender recognition for the request. - // If disabled or unspecified, the related results are excluded. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. - bool enable_gender_recognition = 1; -} - -// Configuration of language recognition. -message LanguageRecognitionConfig { - // The switch that enables language recognition for the request. - // If disabled or unspecified, the related results are excluded. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. - bool enable_language_recognition = 1; -} - -// Configuration for speech recognition. -message SpeechRecognitionConfig { - // The switch that enables speech recognition for the request. - // If disabled or unspecified, the related results are excluded. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. - bool enable_speech_recognition = 1; - - // The maximum number of alternative transcriptions allowed to be included - // per response. 
- // The actual count received can be less than the specified value and may - // also be equal to 0. If unspecified or 0, one alternative is allowed to be - // returned too. - uint32 recognition_alternatives_limit = 2; - - // The switch that enables additional time alignment of recognitions in word - // details. - // If enabled, the `words` field of - // a [`SpeechRecognitionAlternative`](#SpeechRecognitionAlternative) message - // includes a list of [`SpeechRecognitionWord`](#SpeechRecognitionWord) - // messages. Otherwise, it remains empty. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. - bool enable_time_alignment = 3; - - // The name of a language group of models to be used. - // If left unspecified, it backs to the service's default group. - // The service responds with the `NOT_FOUND` gRPC status code - // if the name is not registered. - string language_group_name = 4; - - // The name of a model to be used. - // If left unspecified, it backs to the selected langugage group's default. - // The service responds with the `NOT_FOUND` gRPC status code - // if the name is not registered. - string model_name = 5; - - // Deprecated. - // The additional advanced service-dependend configuration for its speech - // recognizer. It may be silently ignored. - map config_fields = 6; -} - -// A message controlling the processing flow of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message StreamingRecognizeRequestControlMessage { - // The flag that starts the input timer on demand and resets after each final - // result. It is silently ignored if the manual input timer setting is - // disabled for the request. - optional bool start_input_timer = 1; -} - -// A message that carries data contents of -// a [`StreamingRecognizeRequest`](#StreamingRecognizeRequest) request. -message StreamingRecognizeRequestData { - // Part of the audio to perform recognition on. 
- Audio audio = 1; -} - -// Audio contents. -message Audio { - oneof audio_content { - // The audio data bytes. - bytes bytes = 1; - } -} - -// A message streamed from the service through -// the [`StreamingRecognize`](#StreamingRecognize) method. -message StreamingRecognizeResponse { - // The combined recognition results for another part of the audio. - StreamingRecognizeResult result = 1; - - // The cumulative duration of the processed audio during the request, - // not necessarily matching the actual length of the sent audio, mandatorily - // updated with each final result. - google.protobuf.Duration processed_audio_duration = 2; -} - -// Combined recognition result. -message StreamingRecognizeResult { - // The recognition process status. - // It may communicate warnings. In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The flag indicating whether the result is interim or final. - bool is_final = 2; - - // The anticipated causes for the service to finalize a result. - enum ResultFinalizationCause { - // The cause is not specified. - UNSPECIFIED = 0; - - // The speech recognition result is not empty and the end of utterance - // is detected. - SUCCESS = 1; - - // The speech recognition result is empty after the duration to expect - // a result is reached. - NO_INPUT_TIMEOUT = 2; - - // The speech recognition result is not empty after the utterance duration - // limit is reached. The returned speech recognition is incomplete and - // should be completed in the following result. - SUCCESS_MAXTIME = 3; - - // Unused. - PARTIAL_MATCH = 4; - - // The speech recognition result is empty after the utterance duration - // limit is reached. - NO_MATCH_MAXTIME = 5; - } - - // The field indicating the cause of result finalization. - // For interim results, the service should leave the field as `UNSPECIFIED`. 
- // For final results, the service must set the field to a value other than - // `UNSPECIFIED`. - ResultFinalizationCause result_finalization_cause = 3; - - // The speech recognition result for another part of the processed audio, - // new with each final result, updates with each interim one. - // To obtain a complete result for all processed audio, for each final result - // received, a client should pick one of the result's recognition alternatives - // and buffer it on its own. - // It must be omitted if speech recognition is disabled. - SpeechRecognitionResult speech_recognition_result = 4; - - // The current age recognition result for all processed audio, - // updated with each final result. - // It may be omitted in an interim result and must be omitted if age - // recognition is disabled. - AgeRecognitionResult age_recognition_result = 5; - - // The current gender recognition result for all processed audio, - // updated with each final result. - // It may be omitted in an interim result and must be omitted if gender - // recognition is disabled. - GenderRecognitionResult gender_recognition_result = 6; - - // The current language recognition result for all processed audio, - // updated with each final result. - // It may be omitted in an interim result and must be omitted if language - // recognition is disabled. - LanguageRecognitionResult language_recognition_result = 7; -} - -// A result of age recognition. -message AgeRecognitionResult { - // The recognition process status. - // It may communicate warnings. In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The confidence-ordered list of alternative recognition hypotheses. - repeated AgeRecognitionAlternative recognition_alternatives = 2; -} - -// An alternative hypothesis of age recognition. -message AgeRecognitionAlternative { - // The assumed age of the person speaking in the audio, in years. 
- // For a reliable value, assure that there is only one person speaking in - // the audio. - uint32 age = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; -} - -// A result of gender recognition. -message GenderRecognitionResult { - // The recognition process status. - // It may communicate warnings. In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The confidence-ordered list of alternative recognition hypotheses. - repeated GenderRecognitionAlternative recognition_alternatives = 2; -} - -// An alternative hypothesis of gender recognition. -message GenderRecognitionAlternative { - // The assumed gender of the person speaking in the audio. - // For a reliable value, assure that there is only one person speaking in - // the audio. - string gender = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; -} - -// A result of language recognition. -message LanguageRecognitionResult { - // The recognition process status. - // It may communicate warnings. In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The confidence-ordered list of alternative recognition hypotheses. - repeated LanguageRecognitionAlternative recognition_alternatives = 2; -} - -// An alternative hypothesis of language recognition. -message LanguageRecognitionAlternative { - // The language spoken in the audio, - // a [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) tag. - string language = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; -} - -// A result of speech recognition. -message SpeechRecognitionResult { - // The recognition process status. - // It may communicate warnings. 
In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The confidence-ordered list of alternative recognition hypotheses. - repeated SpeechRecognitionAlternative recognition_alternatives = 2; - - // The actual name of the language group of the model, - // unrelated to the actual language spoken in the audio. - string language_group_name = 3; - - // The actual name of the model used to obtain the result. - string model_name = 4; -} - -// An alternative hypothesis of speech recognition. -message SpeechRecognitionAlternative { - // The transcript of the audio. - string transcript = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; - - // The details of the transcript's words. - // Empty unless `enable_time_alignment` is `true` in the request's - // [`SpeechRecognitionConfig`](#SpeechRecognitionConfig). - repeated SpeechRecognitionWord words = 3; -} - -// Details of a single word in speech recognition. -message SpeechRecognitionWord { - // The transcript of the word itself. - string transcript = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; - - // The start time of the word relative to the beginning of the entire audio. - google.protobuf.Duration start_time = 3; - - // The end time of the word relative to the beginning of the entire audio. - google.protobuf.Duration end_time = 4; -} diff --git a/proto/techmo/asr/api/v1p1/asr.proto b/proto/techmo/asr/api/v1p1/asr.proto deleted file mode 100644 index f2af15a..0000000 --- a/proto/techmo/asr/api/v1p1/asr.proto +++ /dev/null @@ -1,481 +0,0 @@ -// Copyright 2023 Techmo sp. z o.o. 
- -syntax = "proto3"; - -package techmo.asr.api.v1p1; - -import "google/protobuf/duration.proto"; -import "techmo/api/status.proto"; - - -// An automatic speech recognition (ASR) service providing a solution for -// speech-to-text conversion extended by the assessment of additional speech -// and speaker features. -service Asr { - // Perform bidirectional streaming recognition. - rpc StreamingRecognize(stream StreamingRecognizeRequest) - returns (stream StreamingRecognizeResponse) {} -} - -// A message streamed from the client through -// the [`StreamingRecognize`](#StreamingRecognize) method. -message StreamingRecognizeRequest { - oneof request_content { - // The immutable initial configuration of the request. - // Must be sent once in the request's first message. - StreamingRecognizeRequestConfig config = 1; - - // The message controlling the processing flow of the request. - // May be sent multiple times except in the request's first message. - StreamingRecognizeRequestControlMessage control_message = 2; - - // The data contents of the request itself. - // May be sent multiple times except in the request's first message. - StreamingRecognizeRequestData data = 3; - } -} - -// A message holding configuration of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message StreamingRecognizeRequestConfig { - // Part of the configuration for the request's audio content. - AudioConfig audio_config = 1; - - // Part of the configuration for the request's result form. - ResultConfig result_config = 2; - - // Part of the configuration for the request's processing flow. - StreamingConfig streaming_config = 3; - - // Part of the configuration for speech recognition. - SpeechRecognitionConfig speech_recognition_config = 4; - - // Part of the configuration for age recognition. - AgeRecognitionConfig age_recognition_config = 5; - - // Part of the configuration for gender recognition. 
- GenderRecognitionConfig gender_recognition_config = 6; - - // Part of the configuration for language recognition. - LanguageRecognitionConfig language_recognition_config = 7; -} - -// Result configuration of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message ResultConfig { - // The switch that toggles continuous recognition into single utterance mode. - // The service returns a final result for each end of utterance it detects in - // the audio, which may occur multiple times during a request. - // If enabled, the request terminates right after its first final result. - bool enable_single_utterance = 1; - - // The switch that allows interim results. - // If enabled, results containing tentative hypotheses may be returned in - // addition to final ones. - // The service should silently ignore this field if it is unsupported. - bool enable_interim_results = 2; - - // The switch to allow the service merging responses in the "hold response" - // state. - // If enabled and there is more than a single response held, the service does - // not return them in a batch. Instead, it tries to merge their results into - // a single response. - // The service should respond with the `INVALID_ARGUMENT` gRPC status code - // if the `recognition_alternatives_limit` field - // of the [`SpeechRecognitionConfig`](#SpeechRecognitionConfig) message is - // greater than 1. - // New in v1p1. - bool enable_held_responses_merging = 3; -} - -// Streaming configuration of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message StreamingConfig { - reserved 1; // bool enable_single_utterance = 1; - - // The switch that enables manual control of the input timer. - // The timer imposes two constraints: one that finalizes recognition after - // a specified period unless speech is detected, and the other that limits - // the total time for an utterance. Manual control allows recognition to - // begin but delays enforcement of these constraints. 
The timer restarts - // after each detected end of utterance (each final result). - // If enabled, the timer does not start automatically. Instead, it can be - // initiated by sending - // a [`StreamingRecognizeRequestControlMessage`](#StreamingRecognizeRequestControlMessage) - // with the `start_input_timer` field set to `true` as needed. This should - // occur after the beginning of the request and be repeated after each final - // result. - bool enable_manual_input_timer = 2; - - // The switch to automatically set the service in the "hold response" state - // at the beginning of the request and after each final result. - // The "hold response" state means that the internal recognition process - // continues, but results are kept, not returned. When needed, the state can - // be toggled into the "give response" state by sending - // the [`StreamingRecognizeRequestControlMessage`](#StreamingRecognizeRequestControlMessage) - // message with the `give_response` field set to `true`. - // In the "give response" state the service responds as soon as it is ready. - // Any held responses may be returned in a batch or as a single merged - // response, provided that the `enable_held_responses_merging` field - // of the [`ResultConfig`](#ResultConfig) message is set to `true`. - // New in v1p1. - bool enable_auto_hold_response = 3; -} - -// Audio configuration of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message AudioConfig { - // The possible audio encodings. - enum AudioEncoding { - // Unspecified audio encoding. - UNSPECIFIED = 0; - - // Linear pulse-code modulation of uncompressed 16-bit signed little-endian - // samples. - LINEAR16 = 1; - - // Free Lossless Audio Codec ([FLAC](https://wiki.xiph.org/FLAC)). - // The encoding requires only about half the bandwidth of `LINEAR16`. - // 16-bit and 24-bit samples. Not all fields in `STREAMINFO` are supported. 
- // When set, the service ignores the `sampling_rate_hz` field and detects - // the actual value from audio header instead. - FLAC = 2; - - // Ogg Encapsulated Opus Audio Codec ([OggOpus](https://wiki.xiph.org/OggOpus)). - // When set, the service ignores the `sampling_rate_hz` field and detects - // the actual value from audio header instead. - OGG_OPUS = 6; - - // MP3 (ISO/IEC 11172-3 and ISO/IEC 13818-3). - // Only constant bitrate. - // When set, the service ignores the `sampling_rate_hz` field and detects - // the actual value from audio header instead. - MP3 = 8; - } - - // The encoding of the audio data sent in the request. Single channel (mono) - // audio is assumed. - // The service should respond with the `INVALID_ARGUMENT` gRPC status code - // if the encoding is `UNSPECIFIED`. - // The service should respond with the `FAILED_PRECONDITION` gRPC status code - // if the encoding is not supported. - AudioEncoding encoding = 1; - - // The sampling rate of the audio data sent in the request. - // The service should silently ignore the field for encodings that are sent - // along wtih headers, and detect the value from them instead. - // The service should respond with the `INVALID_ARGUMENT` gRPC status code - // if the value is not greater than 0. - float sampling_rate_hz = 2; -} - -// Configuration of age recognition. -message AgeRecognitionConfig { - // The switch that enables age recognition for the request. - // If disabled or unspecified, the related results are excluded. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. - bool enable_age_recognition = 1; -} - -// Configuration of gender recognition. -message GenderRecognitionConfig { - // The switch that enables gender recognition for the request. - // If disabled or unspecified, the related results are excluded. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. 
- bool enable_gender_recognition = 1; -} - -// Configuration of language recognition. -message LanguageRecognitionConfig { - // The switch that enables language recognition for the request. - // If disabled or unspecified, the related results are excluded. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. - bool enable_language_recognition = 1; -} - -// Configuration for speech recognition. -message SpeechRecognitionConfig { - // The switch that enables speech recognition for the request. - // If disabled or unspecified, the related results are excluded. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. - bool enable_speech_recognition = 1; - - // The maximum number of alternative transcriptions allowed to be included - // per response. - // The actual count received can be less than the specified value and may - // also be equal to 0. If unspecified or 0, one alternative is allowed to be - // returned too. - uint32 recognition_alternatives_limit = 2; - - // The switch that enables additional time alignment of recognitions in word - // details. - // If enabled, the `words` field of - // a [`SpeechRecognitionAlternative`](#SpeechRecognitionAlternative) message - // includes a list of [`SpeechRecognitionWord`](#SpeechRecognitionWord) - // messages. Otherwise, it remains empty. - // The service responds with the `FAILED_PRECONDITION` gRPC status code - // if requested but not enabled. - bool enable_time_alignment = 3; - - // The name of a language group of models to be used. - // If left unspecified, it backs to the service's default group. - // The service responds with the `NOT_FOUND` gRPC status code - // if the name is not registered. - string language_group_name = 4; - - // The name of a model to be used. - // If left unspecified, it backs to the selected langugage group's default. 
- // The service responds with the `NOT_FOUND` gRPC status code - // if the name is not registered. - string model_name = 5; - - // Deprecated. - // The additional advanced service-dependend configuration for its speech - // recognizer. It may be silently ignored. - map config_fields = 6; -} - -// A message controlling the processing flow of -// a [`StreamingRecognize`](#StreamingRecognize) request. -message StreamingRecognizeRequestControlMessage { - reserved 2; - - oneof control_message_content - { - // The flag that starts the input timer on demand and resets after each final - // result. It is silently ignored if the manual input timer setting is - // disabled for the request. - bool start_input_timer = 1; - - // The flag to allow the service to return a response. - // After receiving this message, the service remains in the "give response" - // state. Ignored when the service is already in the "give response" state. - // Mutually exclusive with the `hold_response` field. - // New in v1p1. - bool give_response = 3; - - // The flag to forbid the service from returning a response. - // After receiving this message, the service remains in the "hold response" - // state. Ignored when the service is already in the "hold response" state. - // Mutually exclusive with the `give_response` field. - // New in v1p1. - bool hold_response = 4; - } -} - -// A message that carries data contents of -// a [`StreamingRecognizeRequest`](#StreamingRecognizeRequest) request. -message StreamingRecognizeRequestData { - // Part of the audio to perform recognition on. - Audio audio = 1; -} - -// Audio contents. -message Audio { - oneof audio_content { - // The audio data bytes. - bytes bytes = 1; - } -} - -// A message streamed from the service through -// the [`StreamingRecognize`](#StreamingRecognize) method. -message StreamingRecognizeResponse { - // The combined recognition results for another part of the audio. 
- StreamingRecognizeResult result = 1; - - // The cumulative duration of the processed audio during the request, - // not necessarily matching the actual length of the sent audio, mandatorily - // updated with each final result. - google.protobuf.Duration processed_audio_duration = 2; -} - -// Combined recognition result. -message StreamingRecognizeResult { - // The recognition process status. - // It may communicate warnings. In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The flag indicating whether the result is interim or final. - bool is_final = 2; - - // The anticipated causes for the service to finalize a result. - enum ResultFinalizationCause { - // The cause is not specified. - UNSPECIFIED = 0; - - // The speech recognition result is not empty and the end of utterance - // is detected. - SUCCESS = 1; - - // The speech recognition result is empty after the duration to expect - // a result is reached. - NO_INPUT_TIMEOUT = 2; - - // The speech recognition result is not empty after the utterance duration - // limit is reached. The returned speech recognition is incomplete and - // should be completed in the following result. - SUCCESS_MAXTIME = 3; - - // Unused. - PARTIAL_MATCH = 4; - - // The speech recognition result is empty after the utterance duration - // limit is reached. - NO_MATCH_MAXTIME = 5; - } - - // The field indicating the cause of result finalization. - // For interim results, the service should leave the field as `UNSPECIFIED`. - // For final results, the service must set the field to a value other than - // `UNSPECIFIED`. - ResultFinalizationCause result_finalization_cause = 3; - - // The speech recognition result for another part of the processed audio, - // new with each final result, updates with each interim one. 
- // To obtain a complete result for all processed audio, for each final result - // received, a client should pick one of the result's recognition alternatives - // and buffer it on its own. - // It must be omitted if speech recognition is disabled. - SpeechRecognitionResult speech_recognition_result = 4; - - // The current age recognition result for all processed audio, - // updated with each final result. - // It may be omitted in an interim result and must be omitted if age - // recognition is disabled. - AgeRecognitionResult age_recognition_result = 5; - - // The current gender recognition result for all processed audio, - // updated with each final result. - // It may be omitted in an interim result and must be omitted if gender - // recognition is disabled. - GenderRecognitionResult gender_recognition_result = 6; - - // The current language recognition result for all processed audio, - // updated with each final result. - // It may be omitted in an interim result and must be omitted if language - // recognition is disabled. - LanguageRecognitionResult language_recognition_result = 7; -} - -// A result of age recognition. -message AgeRecognitionResult { - // The recognition process status. - // It may communicate warnings. In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The confidence-ordered list of alternative recognition hypotheses. - repeated AgeRecognitionAlternative recognition_alternatives = 2; -} - -// An alternative hypothesis of age recognition. -message AgeRecognitionAlternative { - // The assumed age of the person speaking in the audio, in years. - // For a reliable value, assure that there is only one person speaking in - // the audio. - uint32 age = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; -} - -// A result of gender recognition. 
-message GenderRecognitionResult { - // The recognition process status. - // It may communicate warnings. In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The confidence-ordered list of alternative recognition hypotheses. - repeated GenderRecognitionAlternative recognition_alternatives = 2; -} - -// An alternative hypothesis of gender recognition. -message GenderRecognitionAlternative { - // The assumed gender of the person speaking in the audio. - // For a reliable value, assure that there is only one person speaking in - // the audio. - string gender = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; -} - -// A result of language recognition. -message LanguageRecognitionResult { - // The recognition process status. - // It may communicate warnings. In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The confidence-ordered list of alternative recognition hypotheses. - repeated LanguageRecognitionAlternative recognition_alternatives = 2; -} - -// An alternative hypothesis of language recognition. -message LanguageRecognitionAlternative { - // The language spoken in the audio, - // a [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) tag. - string language = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; -} - -// A result of speech recognition. -message SpeechRecognitionResult { - // The recognition process status. - // It may communicate warnings. In case of an error hindering recognition, - // all other message fields should be left unset. - techmo.api.Status error = 1; - - // The confidence-ordered list of alternative recognition hypotheses. 
- repeated SpeechRecognitionAlternative recognition_alternatives = 2; - - // The actual name of the language group of the model, - // unrelated to the actual language spoken in the audio. - string language_group_name = 3; - - // The actual name of the model used to obtain the result. - string model_name = 4; -} - -// An alternative hypothesis of speech recognition. -message SpeechRecognitionAlternative { - // The transcript of the audio. - string transcript = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; - - // The details of the transcript's words. - // Empty unless `enable_time_alignment` is `true` in the request's - // [`SpeechRecognitionConfig`](#SpeechRecognitionConfig). - repeated SpeechRecognitionWord words = 3; -} - -// Details of a single word in speech recognition. -message SpeechRecognitionWord { - // The transcript of the word itself. - string transcript = 1; - - // The confidence estimate, ranging from 0.0 to 1.0. - // Support for this feature is optional. - optional float confidence = 2; - - // The start time of the word relative to the beginning of the entire audio. - google.protobuf.Duration start_time = 3; - - // The end time of the word relative to the beginning of the entire audio. - google.protobuf.Duration end_time = 4; -} diff --git a/pyproject.toml b/pyproject.toml index 873297f..3e5c133 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,23 @@ [build-system] -requires = ["grpcio-tools>=1.49.4,<1.63", "setuptools>=61"] +requires = ["grpcio-tools>=1.49.4,<1.71.0", "setuptools>=61"] build-backend = "setuptools.build_meta" [project] name = "techmo-asr-api" -description = "Techmo ASR API (public)" +description = "Techmo ASR API" dynamic = ["version"] readme = { file = "README.md", content-type = "text/markdown" } authors = [{ name = "Techmo sp. 
z o.o", email = "kontakt@techmo.pl" }] requires-python = ">=3.8" -dependencies = ["grpcio>=1.49.4,<1.63", "protobuf>=4.21.3,<5"] +dependencies = [ + "grpcio>=1.49.4,<1.71.0; python_version=='3.8'", + "grpcio>=1.49.4; python_version>='3.9'", + "protobuf>=4.21.3,<6.0.0; python_version=='3.8'", + "protobuf>=4.21.3; python_version>='3.9'", +] + +[project.optional-dependencies] +tests = ["pytest<8,>=7.4.4", "pytest-cov>=4.1", "pytest-lazy-fixture>=0.6.3"] [project.urls] repository = "https://github.com/techmo-pl/asr-api-python" @@ -19,3 +27,28 @@ version = { attr = "asr_api.VERSION.__version__" } [tool.setuptools.packages.find] include = ["asr_api*", "google*", "techmo*"] + +[tool.pytest.ini_options] +addopts = ["--strict-markers"] +markers = ["""api(name): mark tests as defined for API. \ + Example: api('techmo.asr.api.v1p1'). \ + Use the `--api=` option to collect the marked tests."""] +testpaths = ["tests"] + +[tool.coverage.report] +precision = 1 +show_missing = true + +[tool.mypy] + +[[tool.mypy.overrides]] +module = ["techmo.*", "google.*"] +ignore_errors = true + +[[tool.mypy.overrides]] +module = "tests.*" +disallow_untyped_decorators = false + +[tool.coverage.run] +source_pkgs = ["asr_api"] +relative_files = true diff --git a/setup.py b/setup.py index 196ac3d..2ca8195 100644 --- a/setup.py +++ b/setup.py @@ -1,71 +1,90 @@ +import os from pathlib import Path -from typing import List +from typing import Any, Optional, Sequence, Union import setuptools +_PathLike = Union[str, bytes, "os.PathLike[Any]"] +_PathLikes = Sequence[_PathLike] -def protoc(args: List[str]): - import pkg_resources + +def _update_submodule( + submodule_path: _PathLike, + git_submodule_update_options: Sequence[str] = ("--init", "--depth", "1", "--"), + working_directory_path: Optional[_PathLike] = None, +) -> None: + import subprocess + + if (Path(str(working_directory_path) if working_directory_path else ".") / str(submodule_path) / ".git").exists(): + return + + if ( + subprocess.call( + 
command := (("git", "submodule", "update") + tuple(git_submodule_update_options) + (str(submodule_path),)), + cwd=working_directory_path, + ) + != 0 + ): + raise Exception(f"error: {command} failed") + + +def _protoc(*args: str) -> None: + import grpc_tools from grpc_tools import protoc - command = [ - "grpc_tools.protoc", - "--proto_path={}".format( - Path(pkg_resources.resource_filename("grpc_tools", "_proto")) - ), - ] + args - - if protoc.main(command) != 0: - raise Exception("error: {} failed".format(command)) - - -def build_package_grpc_protos( - protos_paths: List[Path], import_directory_paths: List[Path] = [] -): - protoc( - [ - "--proto_path={}".format(Path(import_directory_path)) - for import_directory_path in import_directory_paths - ] - + ["--grpc_python_out=."] - + protos_paths, + if ( + protoc.main( + command := ( + "grpc_tools.protoc", + "--proto_path={}".format(Path(grpc_tools.__file__).parent / "_proto"), + ) + + args + ) + != 0 + ): + raise Exception(f"error: {command} failed") + + +def _build_package_grpc_protos( + proto_paths: _PathLikes, + import_directory_paths: Optional[_PathLikes] = None, +) -> None: + _protoc( + *(f"--proto_path={str(import_directory_path)}" for import_directory_path in import_directory_paths or ()), + "--grpc_python_out=.", + *(str(proto_path) for proto_path in proto_paths), ) -def build_package_protos( - protos_paths: List[Path], import_directory_paths: List[Path] = [] -): - protoc( - [ - "--proto_path={}".format(Path(import_directory_path)) - for import_directory_path in import_directory_paths - ] - + ["--python_out=."] - + protos_paths, +def _build_package_protos( + proto_paths: _PathLikes, + import_directory_paths: Optional[_PathLikes] = None, +) -> None: + _protoc( + *(f"--proto_path={str(import_directory_path)}" for import_directory_path in import_directory_paths or ()), + "--python_out=.", + *(str(proto_path) for proto_path in proto_paths), ) -build_package_protos( - protos_paths=[ - 
"./proto/google/rpc/status.proto", - "./proto/techmo/api/status.proto", - "./proto/techmo/asr/api/dictation/asr.proto", - "./proto/techmo/asr/api/v1/asr.proto", - "./proto/techmo/asr/api/v1p1/asr.proto", - ], - import_directory_paths=[ - "./proto", - ], +_update_submodule("./submodules/asr-api") +_build_package_protos( + ( + "./submodules/asr-api/proto/google/rpc/status.proto", + "./submodules/asr-api/proto/techmo/api/status.proto", + "./submodules/asr-api/proto/techmo/asr/api/dictation/asr.proto", + "./submodules/asr-api/proto/techmo/asr/api/v1/asr.proto", + "./submodules/asr-api/proto/techmo/asr/api/v1p1/asr.proto", + ), + import_directory_paths=("./submodules/asr-api/proto",), ) -build_package_grpc_protos( - protos_paths=[ - "./proto/techmo/asr/api/dictation/asr.proto", - "./proto/techmo/asr/api/v1/asr.proto", - "./proto/techmo/asr/api/v1p1/asr.proto", - ], - import_directory_paths=[ - "./proto", - ], +_build_package_grpc_protos( + ( + "./submodules/asr-api/proto/techmo/asr/api/dictation/asr.proto", + "./submodules/asr-api/proto/techmo/asr/api/v1/asr.proto", + "./submodules/asr-api/proto/techmo/asr/api/v1p1/asr.proto", + ), + import_directory_paths=("./submodules/asr-api/proto",), ) setuptools.setup() diff --git a/setup.sh b/setup.sh new file mode 100755 index 0000000..f319aa5 --- /dev/null +++ b/setup.sh @@ -0,0 +1,10 @@ +#!/bin/bash +# +# usage: ./setup.sh +# +# Run once after cloning: initialises git submodules. 
+ +set -euo pipefail + +git submodule sync --recursive +git submodule update --init --recursive diff --git a/submodules/asr-api b/submodules/asr-api new file mode 160000 index 0000000..084c836 --- /dev/null +++ b/submodules/asr-api @@ -0,0 +1 @@ +Subproject commit 084c836bff448aff140dd2391499a297aacabc4f diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..7650569 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,22 @@ +from typing import List + +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + parser.addoption( + "--api", + default=None, + choices=[ + "techmo.asr.api.dictation", + "techmo.asr.api.v1", + "techmo.asr.api.v1p1", + ], + help="the argument of tests marked with the `@pytest.mark.api(name)` marker to be collected; one of: %(choices)s (default is %(default)r)", + metavar="name", + ) + + +def pytest_collection_modifyitems(config: pytest.Config, items: List[pytest.Item]) -> None: + if api := config.getoption("--api"): + items[:] = (item for item in items if (mark := item.get_closest_marker("api")) and mark.args and mark.args[0] == api) diff --git a/tests/test_asr_api.py b/tests/test_asr_api.py new file mode 100644 index 0000000..2dcf141 --- /dev/null +++ b/tests/test_asr_api.py @@ -0,0 +1,140 @@ +from typing import cast + +import pytest + + +@pytest.fixture +def asr_api_dictation() -> object: + import asr_api.dictation + + return asr_api.dictation + + +@pytest.fixture( + params=( + "Speech", + "RecognizeRequest", + "StreamingRecognizeRequest", + "StreamingRecognitionConfig", + "RecognitionConfig", + "SpeechDurationConfig", + "SpeechDurationThresholdMode", + "SpeechContext", + "ConfigField", + "RecognitionAudio", + "RecognizeResponse", + "StreamingRecognizeResponse", + "StreamingRecognitionResult", + "SpeechRecognitionResult", + "SpeechRecognitionAlternative", + "WordInfo", + "RecognitionLattice", + 
"LatticeEdge", + "Gender", + "Age", + ), +) +def asr_api_dictation_attr(request: pytest.FixtureRequest) -> str: + return cast(str, request.param) + + +@pytest.fixture +def asr_api_v1() -> object: + import asr_api.v1 + + return asr_api.v1 + + +@pytest.fixture( + params=( + "Asr", + "StreamingRecognizeRequest", + "StreamingRecognizeRequestConfig", + "ResultConfig", + "StreamingConfig", + "AudioConfig", + "AgeRecognitionConfig", + "GenderRecognitionConfig", + "LanguageRecognitionConfig", + "SpeechRecognitionConfig", + "StreamingRecognizeRequestControlMessage", + "StreamingRecognizeRequestData", + "Audio", + "StreamingRecognizeResponse", + "StreamingRecognizeResult", + "AgeRecognitionResult", + "AgeRecognitionAlternative", + "GenderRecognitionResult", + "GenderRecognitionAlternative", + "LanguageRecognitionResult", + "LanguageRecognitionAlternative", + "SpeechRecognitionResult", + "SpeechRecognitionAlternative", + "SpeechRecognitionWord", + ), +) +def asr_api_v1_attr(request: pytest.FixtureRequest) -> str: + return cast(str, request.param) + + +@pytest.fixture +def asr_api_v1p1() -> object: + import asr_api.v1p1 + + return asr_api.v1p1 + + +@pytest.fixture( + params=( + "Asr", + "StreamingRecognizeRequest", + "StreamingRecognizeRequestConfig", + "ResultConfig", + "StreamingConfig", + "AudioConfig", + "AgeRecognitionConfig", + "GenderRecognitionConfig", + "LanguageRecognitionConfig", + "SpeechRecognitionConfig", + "StreamingRecognizeRequestControlMessage", + "StreamingRecognizeRequestData", + "Audio", + "StreamingRecognizeResponse", + "StreamingRecognizeResult", + "AgeRecognitionResult", + "AgeRecognitionAlternative", + "GenderRecognitionResult", + "GenderRecognitionAlternative", + "LanguageRecognitionResult", + "LanguageRecognitionAlternative", + "SpeechRecognitionResult", + "SpeechRecognitionAlternative", + "SpeechRecognitionWord", + ), +) +def asr_api_v1p1_attr(request: pytest.FixtureRequest) -> str: + return cast(str, request.param) + + +@pytest.mark.parametrize( + 
"api, attr", + ( + pytest.param( + pytest.lazy_fixture("asr_api_dictation"), + pytest.lazy_fixture("asr_api_dictation_attr"), + marks=pytest.mark.api("techmo.asr.api.dictation"), + ), + pytest.param( + pytest.lazy_fixture("asr_api_v1"), + pytest.lazy_fixture("asr_api_v1_attr"), + marks=pytest.mark.api("techmo.asr.api.v1"), + ), + pytest.param( + pytest.lazy_fixture("asr_api_v1p1"), + pytest.lazy_fixture("asr_api_v1p1_attr"), + marks=pytest.mark.api("techmo.asr.api.v1p1"), + ), + ), +) +def test_hasattr(api: object, attr: str) -> None: + assert hasattr(api, attr) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..ce99428 --- /dev/null +++ b/tox.ini @@ -0,0 +1,36 @@ +[tox] +# Python 3.7 is not downloadable via uv; minimum testable version is 3.8. +# Python 3.14 is included to catch forward-compat issues early. +envlist = py38, py39, py310, py311, py312, py313, py314 +requires = + tox-uv>=1 + +[testenv] +# skip_install keeps each env lightweight: the package is found via PYTHONPATH +# rather than doing a full editable install in each env. +# Proto stubs (techmo/asr/api/*_pb2.py) are gitignored — they are generated +# by setup.py at build time. In CI the before_script runs ./install.sh first; +# locally, run `./install.sh` first (requires grpcio-tools and the asr-api +# submodule). +skip_install = true +set_env = PYTHONPATH = {toxinidir} +# Pass service-address variables so integration tests can connect to a live service +# when run via tox (e.g. tox -e py312 -- -m integration). +passenv = + ASR_* +deps = + # grpcio 1.71.0 dropped Python 3.8 + grpcio>=1.49.4,<1.71.0; python_version=="3.8" + grpcio>=1.49.4; python_version>="3.9" + protobuf>=4.21.3,<6.0.0; python_version=="3.8" + protobuf>=4.21.3; python_version>="3.9" + pytest>=7.4.4,<8 + pytest-cov>=4.1 + pytest-lazy-fixture>=0.6.3 +commands_pre = + # Abort early with a clear message if proto stubs are missing rather than + # letting pytest fail with a cryptic ImportError deep in imports. 
+ # PYTHONPATH already contains {toxinidir} so we use it to locate the stub. + python -c "import os, sys; stub = os.path.join(os.environ['PYTHONPATH'], 'techmo', 'asr', 'api', 'dictation', 'asr_pb2.py'); sys.exit(0) if os.path.exists(stub) else sys.exit('Proto stubs missing. Run: ./install.sh')" +commands = + pytest --color=yes --cov=asr_api --cov-report=term-missing --cov-report=xml:{envtmpdir}/coverage.xml {posargs}