diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..48dadd7
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,55 @@
+name: Tests
+
+on:
+ push:
+ branches: ["main", "master"]
+ pull_request:
+ workflow_dispatch:
+
+jobs:
+ test:
+ name: Python ${{ matrix.python-version }}
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: recursive
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v5
+ with:
+ cache-dependency-glob: "pyproject.toml"
+
+ - name: Generate proto stubs and install package
+ run: ./install.sh
+
+ - name: Run tests via tox
+ run: |
+ TOXENV="py$(echo '${{ matrix.python-version }}' | tr -d '.')"
+ uvx --with "tox-uv>=1" tox -e "${TOXENV}"
+
+ test-py314:
+ name: Python 3.14 (allowed failure)
+ runs-on: ubuntu-latest
+ continue-on-error: true
+
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ submodules: recursive
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v5
+ with:
+ cache-dependency-glob: "pyproject.toml"
+
+ - name: Generate proto stubs and install package
+ run: ./install.sh
+
+ - name: Run tests via tox
+ run: uvx --with "tox-uv>=1" tox -e py314
diff --git a/.gitignore b/.gitignore
index ccdbf67..6262d55 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,10 @@
build/
*.egg-info/
.eggs/
+.tox/
google/
__pycache__/
techmo/
.venv/
.vscode/
+pre-commit/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..20f982f
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "submodules/asr-api"]
+ path = submodules/asr-api
+ url = https://github.com/techmo-pl/asr-api.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3c71927..d75ed34 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,28 @@
# Changelog of ASR API (Python)
-## [1.0.0] - 2024-08-14
+
+## [1.1.4] - 2026-03-22
+
+### Fixed
+
+- `install.sh`: added `export PATH="$HOME/.local/bin:$PATH"` so that `uv` is found on runners where it is installed locally rather than system-wide.
+- `VERSION.py`: corrected version string (was not updated when 1.1.3 was tagged).
+
+### Changed
+
+- `setup.py`: replaced `pkg_resources` with importlib-compatible path resolution; removed upper bound on setuptools; removed upper bound on grpcio-tools build requirement.
+- `pyproject.toml`: relaxed upper bounds on grpcio and protobuf runtime requirements; added Python-version markers to guard Python 3.8 users from grpcio>=1.71.0 and protobuf>=6.0.0; grpcio bounds set to `>=1.49.4,<1.71.0` for Python 3.8 and `>=1.49.4` for 3.9+; protobuf bounds set to `>=4.21.3,<6`; `requires-python` lowered to `>=3.8`; introduced upper bound on setuptools below 82; added `pip<26` constraint.
+- `tox.ini`, `install.sh`: introduced uv-based multi-version testing (Python 3.8–3.14); replaced Docker-based single-version test with tox multi-version matrix.
+- `submodules/asr-api`: updated to v1.1.1; restructured from committed proto files to a submodule.
+- `asr_api/`: support for _techmo.asr.api.v1p1_ API.
+- `tests/`: attribute check for _techmo.asr.api.v1p1_ API.
+
+
+## [1.0.0] - 2024-01-29
### Added
-- _asr_api_ package
- - support for _techmo.asr.api.dictation_ API
- - support for _techmo.asr.api.v1_ API
-- Setuptools configuration
+- `asr_api/`: support for _techmo.asr.api.dictation_ and _techmo.asr.api.v1_ APIs.
+- `pyproject.toml`: setuptools configuration.
+- `tests/`: attribute checks for _techmo.asr.api.dictation_ and _techmo.asr.api.v1_ APIs; coverage report.
+- `submodules/asr-api`: asr-api v1.0.0.
diff --git a/README.md b/README.md
index 2b90047..38d64d9 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,16 @@
# ASR API (Python)
-The collection of gRPC APIs for Techmo ASR supplied as a Python package.
+The collection of gRPC APIs for Techmo ASR solutions supplied as a Python package.
## Setup
-The project can be used as-is and does not require any additional setup.
+Run the following script once after cloning to initialise the submodule:
-## Requirements
+```sh
+./setup.sh
+```
+
+### Requirements
- [Python](https://www.python.org/) >=3.8
@@ -23,6 +27,23 @@ pip install --require-virtualenv --upgrade pip
pip install --require-virtualenv .
```
+*For basic development use, the convenience script `./install.sh` performs these steps for you.*
+
+## Running tests
+
+Proto stubs must be generated before running tests. Use `./install.sh` once, then invoke tox:
+
+```sh
+./install.sh
+uvx --with "tox-uv>=1" tox
+```
+
+To run a single Python version:
+
+```sh
+uvx --with "tox-uv>=1" tox -e py312
+```
+
## Usage
### Import
@@ -31,6 +52,14 @@ The package provides a precompiled collection of `.proto` files that can be impo
Example:
+- direct import
+
+```python
+>>> from techmo.asr.api.v1p1 import asr_pb2 as api
+>>> hasattr(api, "StreamingRecognizeRequest")
+True
+```
+
- import from an alias module
```python
diff --git a/VERSION.md b/VERSION.md
deleted file mode 100644
index 3eefcb9..0000000
--- a/VERSION.md
+++ /dev/null
@@ -1 +0,0 @@
-1.0.0
diff --git a/asr_api/VERSION.py b/asr_api/VERSION.py
index 5becc17..c72e379 100644
--- a/asr_api/VERSION.py
+++ b/asr_api/VERSION.py
@@ -1 +1 @@
-__version__ = "1.0.0"
+__version__ = "1.1.4"
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..490190a
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# usage: ./install.sh [VENV_PATH]
+#
+# VENV_PATH: Optional path for the virtual environment (default: ./.venv).
+#
+# Creates a virtualenv with uv and installs the package with test dependencies.
+
+set -euo pipefail
+
+VENV_PATH="${1:-.venv}"
+
+if [ ! -d "${VENV_PATH}" ]; then
+ uv venv "${VENV_PATH}"
+fi
+
+# shellcheck disable=SC1091
+source "${VENV_PATH}/bin/activate"
+uv pip install -e ".[tests]"
diff --git a/proto/google/rpc/status.proto b/proto/google/rpc/status.proto
deleted file mode 100644
index 923e169..0000000
--- a/proto/google/rpc/status.proto
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2022 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-syntax = "proto3";
-
-package google.rpc;
-
-import "google/protobuf/any.proto";
-
-option cc_enable_arenas = true;
-option go_package = "google.golang.org/genproto/googleapis/rpc/status;status";
-option java_multiple_files = true;
-option java_outer_classname = "StatusProto";
-option java_package = "com.google.rpc";
-option objc_class_prefix = "RPC";
-
-// The `Status` type defines a logical error model that is suitable for
-// different programming environments, including REST APIs and RPC APIs. It is
-// used by [gRPC](https://github.com/grpc). Each `Status` message contains
-// three pieces of data: error code, error message, and error details.
-//
-// You can find out more about this error model and how to work with it in the
-// [API Design Guide](https://cloud.google.com/apis/design/errors).
-message Status {
- // The status code, which should be an enum value of
- // [google.rpc.Code][google.rpc.Code].
- int32 code = 1;
-
- // A developer-facing error message, which should be in English. Any
- // user-facing error message should be localized and sent in the
- // [google.rpc.Status.details][google.rpc.Status.details] field, or localized
- // by the client.
- string message = 2;
-
- // A list of messages that carry the error details. There is a common set of
- // message types for APIs to use.
- repeated google.protobuf.Any details = 3;
-}
diff --git a/proto/techmo/api/status.proto b/proto/techmo/api/status.proto
deleted file mode 100644
index 71a21ca..0000000
--- a/proto/techmo/api/status.proto
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright 2023 Techmo sp. z o.o.
-
-syntax = "proto3";
-
-package techmo.api;
-
-import "google/protobuf/any.proto";
-
-
-//
-message Status {
- //
- int32 code = 1;
-
- //
- string message = 2;
-
- //
- repeated google.protobuf.Any details = 3;
-}
diff --git a/proto/techmo/asr/api/dictation/asr.proto b/proto/techmo/asr/api/dictation/asr.proto
deleted file mode 100644
index 2380707..0000000
--- a/proto/techmo/asr/api/dictation/asr.proto
+++ /dev/null
@@ -1,782 +0,0 @@
-// Copyright 2018 Google LLC.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Modified by Techmo, copyright by Google. Changes include:
-// 1. Additions that introduce new features to the original API. Extensions
-// (parts that were added to the original document) by Techmo are marked with
-// [**Extension by Techmo**] tag.
-// - `MP3` audio encoding type.
-// - `ConfigField` as means to provide additional configuration.
-// - `ResultFinalizationCause` as means to indicate MRCPv2-related recognition
-// result finalization cause.
-// - `RecognitionLattice` and `LatticeEdge` as means to return detailed
-// recognition results.
-// - `Age` and `Gender` as means to provide age and gender recognition results
-// in `SpeechRecognitionResult` and `StreamingRecognitionResult`.
-// 2. Modifications of comments, according to how recognition is performed by Techmo.
-// - [*Unused*] tags for fields or values that are not used (ignored when
-// provided in request, never returned in response).
-// - [*Unsupported*] tags for fields or values that will result in an error
-// when provided in request.
-// 3. Removal of `LongRunningRecognize` support (commented out).
-
-syntax = "proto3";
-
-package google.cloud.speech.v1;
-
-// import "google/api/annotations.proto";
-// import "google/longrunning/operations.proto";
-// import "google/protobuf/any.proto";
-import "google/protobuf/duration.proto";
-// import "google/protobuf/empty.proto";
-// import "google/protobuf/timestamp.proto";
-import "google/rpc/status.proto";
-
-option cc_enable_arenas = true;
-option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1;speech";
-option java_multiple_files = true;
-option java_outer_classname = "SpeechProto";
-option java_package = "com.google.cloud.speech.v1";
-
-
-// Service that implements Google Cloud Speech API extended by Techmo.
-service Speech {
- // Performs synchronous speech recognition: receive results after all audio
- // has been sent and processed.
- rpc Recognize(RecognizeRequest) returns (RecognizeResponse) {
- // option (google.api.http) = {
- // post: "/v1/speech:recognize"
- // body: "*"
- // };
- }
-
- // // Performs asynchronous speech recognition: receive results via the
- // // google.longrunning.Operations interface. Returns either an
- // // `Operation.error` or an `Operation.response` which contains
- // // a `LongRunningRecognizeResponse` message.
- // rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
- // option (google.api.http) = {
- // post: "/v1/speech:longrunningrecognize"
- // body: "*"
- // };
- // }
-
- // Performs bidirectional streaming speech recognition: receive results while
- // sending audio. This method is only available via the gRPC API (not REST).
- rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
- }
-}
-
-// The top-level message sent by the client for the `Recognize` method.
-message RecognizeRequest {
- // [*Required*] Provides information to the recognizer that specifies how to
- // process the request.
- RecognitionConfig config = 1;
-
- // [*Required*] The audio data to be recognized.
- RecognitionAudio audio = 2;
-}
-
-// // The top-level message sent by the client for the `LongRunningRecognize`
-// // method.
-// message LongRunningRecognizeRequest {
-// // [*Required*] Provides information to the recognizer that specifies how to
-// // process the request.
-// RecognitionConfig config = 1;
-
-// // [*Required*] The audio data to be recognized.
-// RecognitionAudio audio = 2;
-// }
-
-// The top-level message sent by the client for the `StreamingRecognize` method.
-// Multiple `StreamingRecognizeRequest` messages are sent. The first message
-// must contain a `streaming_config` message and must not contain `audio` data.
-// All subsequent messages must contain `audio` data and must not contain a
-// `streaming_config` message.
-message StreamingRecognizeRequest {
- // The streaming request, which is either a streaming config or audio content.
- oneof streaming_request {
- // Provides information to the recognizer that specifies how to process the
- // request. The first `StreamingRecognizeRequest` message must contain a
- // `streaming_config` message.
- StreamingRecognitionConfig streaming_config = 1;
-
- // The audio data to be recognized. Sequential chunks of audio data are sent
- // in sequential `StreamingRecognizeRequest` messages. The first
- // `StreamingRecognizeRequest` message must not contain `audio_content` data
- // and all subsequent `StreamingRecognizeRequest` messages must contain
- // `audio_content` data. The audio bytes must be encoded as specified in
- // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
- // pure binary representation (not base64).
- bytes audio_content = 2;
-
- // @exclude See [content limits](/speech-to-text/quotas#content).
-
- // [**Extension by Techmo**]
- // Another experimental feature from MRCPv2.
- // https://www.rfc-editor.org/rfc/rfc6787.html#section-9.13
- bool start_input_timers = 3;
- }
-}
-
-// Provides information to the recognizer that specifies how to process the
-// request.
-message StreamingRecognitionConfig {
- // [*Required*] Provides information to the recognizer that specifies how to
- // process the request.
- RecognitionConfig config = 1;
-
- // [*Optional*] If `false` or omitted, the recognizer will perform continuous
- // recognition (continuing to wait for and process audio even if the user
- // pauses speaking) until the client closes the input stream (gRPC API) or
- // until the maximum time limit has been reached. May return multiple
- // `StreamingRecognitionResult`s with the `is_final` flag set to `true`.
- // If `true`, the recognizer will detect a single spoken utterance. When it
- // detects that the user has paused or stopped speaking, it will return an
- // `END_OF_SINGLE_UTTERANCE` event and cease recognition. It will return no
- // more than one `StreamingRecognitionResult` with the `is_final` flag set to
- // `true`.
- bool single_utterance = 2;
-
- // [*Optional*] If `true`, interim results (tentative hypotheses) may be
- // returned as they become available (these interim results are indicated with
- // the `is_final=false` flag).
- // If `false` or omitted, only `is_final=true` result(s) are returned.
- bool interim_results = 3;
-
- // [**Extension by Techmo**]
- // Another experimental feature from MRCPv2.
- // https://www.rfc-editor.org/rfc/rfc6787.html#section-9.4.14
- optional bool start_input_timers = 4;
-}
-
-// Provides information to the recognizer that specifies how to process the
-// request.
-message RecognitionConfig {
-
- // @exclude The encoding of the audio data sent in the request.
- //
- // All encodings support only 1 channel (mono) audio.
- //
- // For best results, the audio source should be captured and transmitted using
- // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
- // recognition can be reduced if lossy codecs are used to capture or transmit
- // audio, particularly if background noise is present. Lossy codecs include
- // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `SPEEX_WITH_HEADER_BYTE`.
- //
- // The `FLAC` and `WAV` audio file formats include a header that describes the
- // included audio content. You can request recognition for `WAV` files that
- // contain either `LINEAR16` or `MULAW` encoded audio.
- // If you send `FLAC` or `WAV` audio file format in
- // your request, you do not need to specify an `AudioEncoding`; the audio
- // encoding format is determined from the file header. If you specify
- // an `AudioEncoding` when you send send `FLAC` or `WAV` audio, the
- // encoding configuration must match the encoding described in the audio
- // header; otherwise the request returns an
- // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
-
- // The encoding of the audio data sent in the request.
- // All encodings support only 1 channel (mono) audio.
- enum AudioEncoding {
- // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
- ENCODING_UNSPECIFIED = 0;
-
- // Uncompressed 16-bit signed little-endian samples (Linear PCM).
- LINEAR16 = 1;
-
- // `FLAC` (Free Lossless Audio
- // Codec) is the recommended encoding because it is
- // lossless--therefore recognition is not compromised--and
- // requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
- // encoding supports 16-bit and 24-bit samples, however, not all fields in
- // `STREAMINFO` are supported.
- // [**Extension by Techmo**] Supported only by `Recognize`. When requested by `StreamingRecognize`, will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. Silently ignores `sample_rate_hertz` and detects real sample rate from file header instead.
- FLAC = 2;
-
- // @exclude 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
-
- // [*Unsupported*]
- MULAW = 3;
-
- // @exclude Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
-
- // [*Unsupported*]
- AMR = 4;
-
- // @exclude Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
-
- // [*Unsupported*]
- AMR_WB = 5;
-
- // Opus encoded audio frames in Ogg container
- // ([OggOpus](https://wiki.xiph.org/OggOpus)).
- // [**Extension by Techmo**] Silently ignores `sample_rate_hertz` and detects real sample rate from file header instead.
- OGG_OPUS = 6;
-
- // @exclude `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
-
- // @exclude Although the use of lossy encodings is not recommended, if a very low
- // bitrate encoding is required, `OGG_OPUS` is highly preferred over
- // Speex encoding. The [Speex](https://speex.org/) encoding supported by
- // Cloud Speech API has a header byte in each block, as in MIME type
- // `audio/x-speex-with-header-byte`.
- // It is a variant of the RTP Speex encoding defined in
- // [RFC 5574](https://tools.ietf.org/html/rfc5574).
- // The stream is a sequence of blocks, one block per RTP packet. Each block
- // starts with a byte containing the length of the block, in bytes, followed
- // by one or more frames of Speex data, padded to an integral number of
- // bytes (octets) as specified in RFC 5574. In other words, each RTP header
- // is replaced with a single byte containing the block length. Only Speex
- // wideband is supported. `sample_rate_hertz` must be 16000.
-
- // [*Unsupported*]
- SPEEX_WITH_HEADER_BYTE = 7;
-
- // [**Extension by Techmo**] `MP3` (standards ISO/IEC 11172-3 and ISO/IEC 13818-3) Only constant bit rate files are accepted. Silently ignores `sample_rate_hertz` and detects real sample rate from file header instead.
- MP3 = 8;
- }
-
- // [*Required*] Encoding of audio data sent in all `RecognitionAudio` messages.
- AudioEncoding encoding = 1;
-
- // @exclude Encoding of audio data sent in all `RecognitionAudio` messages.
- // This field is optional for `FLAC` and `WAV` audio files and required
- // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
-
- // [*Required*] Sample rate in Hertz of the audio data sent in all
- // `RecognitionAudio` messages. Valid values are: 8000-48000.
- // 16000 is optimal. For best results, set the sampling rate of the audio
- // source to 16000 Hz. If that's not possible, use the native sample rate of
- // the audio source (instead of re-sampling).
- // [**Extension by Techmo**] Silently ignored for `FLAC`, `OGG_OPUS` and `MP3` encodings. Real sample rate will be detected from file header instead.
- int32 sample_rate_hertz = 2;
-
- // @exclude This field is optional for `FLAC` and `WAV` audio files and required
- // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
-
- // [*Required*] The language of the supplied audio as a
- // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
- // Example: "en-US".
- // The only language supported at the moment is Polish (`pl-PL`).
- string language_code = 3;
-
- // @exclude See [Language Support](/speech-to-text/docs/languages)
- // for a list of the currently supported language codes.
-
- // [*Optional*] Maximum number of recognition hypotheses to be returned.
- // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
- // within each `SpeechRecognitionResult`.
- // The server may return fewer than `max_alternatives`.
- // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
- // one. If omitted, will return a maximum of one.
- int32 max_alternatives = 4;
-
- // @exclude [*Optional*] If set to `true`, the server will attempt to filter out
- // profanities, replacing all but the initial character in each filtered word
- // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
- // won't be filtered out.
-
- // [*Optional*][*Unused*]
- bool profanity_filter = 5;
-
- // @exclude [*Optional*] array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
- // First element of the array is used to identify context model to be used
- // in current recognition.
-
- // [*Optional*]
- repeated SpeechContext speech_contexts = 6;
-
- // [*Optional*] If `true`, the top result includes a list of words and
- // the start and end time offsets (timestamps) for those words. If
- // `false`, no word-level time offset information is returned. The default is
- // `false`.
- bool enable_word_time_offsets = 8;
-
- // @exclude [*Optional*] If 'true', adds punctuation to recognition result hypotheses.
- // This feature is only available in select languages. Setting this for
- // requests in other languages has no effect at all.
- // The default 'false' value does not add punctuation to result hypotheses.
- // Note: This is currently offered as an experimental service, complimentary
- // to all users. In the future this may be exclusively available as a
- // premium feature.
-
- // [*Optional*][*Unused*]
- bool enable_automatic_punctuation = 11;
-
- // [**Extension by Techmo**]
- // [*Optional*] A means to provide additional configuration fields via request.
- repeated ConfigField config_fields = 12;
-
- // @exclude [*Optional*] Which model to select for the given request. Select the model
- // best suited to your domain to get best results. If a model is not
- // explicitly specified, then we auto-select a model based on the parameters
- // in the RecognitionConfig.
- //
- //
- // | Model |
- // Description |
- //
- //
- // command_and_search |
- // Best for short queries such as voice commands or voice search. |
- //
- //
- // phone_call |
- // Best for audio that originated from a phone call (typically
- // recorded at an 8khz sampling rate). |
- //
- //
- // video |
- // Best for audio that originated from from video or includes multiple
- // speakers. Ideally the audio is recorded at a 16khz or greater
- // sampling rate. This is a premium model that costs more than the
- // standard rate. |
- //
- //
- // default |
- // Best for audio that is not one of the specific audio models.
- // For example, long-form audio. Ideally the audio is high-fidelity,
- // recorded at a 16khz or greater sampling rate. |
- //
- //
-
- // [*Optional*][*Unused*]
- string model = 13;
-
- // @exclude [*Optional*] Set to true to use an enhanced model for speech recognition.
- // You must also set the `model` field to a valid, enhanced model. If
- // `use_enhanced` is set to true and the `model` field is not set, then
- // `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
- // version of the specified model does not exist, then the speech is
- // recognized using the standard version of the specified model.
- //
- // Enhanced speech models require that you opt-in to data logging using
- // instructions in the [documentation](/speech-to-text/enable-data-logging).
- // If you set `use_enhanced` to true and you have not enabled audio logging,
- // then you will receive an error.
-
- // [*Optional*][*Unused*]
- bool use_enhanced = 14;
-
- // [**Extension by Techmo**][*Optional*] Gender and age recognition parameters
- SpeechDurationConfig speech_duration_gender_recognition = 15;
-
- // [**Extension by Techmo**][*Optional*] Gender and age recognition parameters
- SpeechDurationConfig speech_duration_age_recognition = 16;
-}
-
-// [**Extension by Techmo**] Gender and age recognition parameters.
-message SpeechDurationConfig {
- // The way in which service decides when to start recognition.
- SpeechDurationThresholdMode speech_duration_threshold_mode = 3;
-
- // The minimum duration of speech in `audio` required to start recognition, in ms.
- // Ignored, unless `speech_duration_threshold_mode` is `CUSTOM`.
- uint32 speech_duration_threshold_ms = 4;
-}
-
-// [**Extension by Techmo**]
-// The possible ways for a service to decide when to start recognition
-// depending on a duration of speech in `audio`.
-enum SpeechDurationThresholdMode {
- // Use an implementation-defined threshold value carefully tuned to obtain best results.
- DEFAULT = 0;
-
- // Use an user-defined threshold value provided in the configuration message.
- CUSTOM = 1;
-
- // Disable early start of recognition and wait for the entire audio data.
- DISABLED = 2;
-}
-
-
-// @exclude Provides "hints" to the speech recognizer to favor specific words and phrases
-// in the results.
-
-message SpeechContext {
- // @exclude [*Optional*] Can be used to send a context phrase that switches the model
- // used during recognition. If the phrase correctly identifies the context model
- // used in service, it will be used instead of the general model for the current recognition.
- // Due to compatibility with Google API, the object is defined as a list of strings,
- // but only the first element of the list is used as the context phrase,
- // the rest are ignored if present.
-
- repeated string phrases = 1;
-}
-
-// [**Extension by Techmo**]
-// Provides a pair of configuration field name and value.
-message ConfigField {
- // Name of configuration field.
- string key = 1;
-
- // Value of configuration field.
- string value = 2;
-}
-
-// @exclude Contains audio data in the encoding specified in the `RecognitionConfig`.
-// Either `content` or `uri` must be supplied. Supplying both or neither
-// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
-// [audio limits](https://cloud.google.com/speech/limits#content).
-
-// Contains audio data in the encoding specified in the `RecognitionConfig`.
-// Only `content` is allowed to be supplied.
-message RecognitionAudio {
-
- // @exclude The audio source, which is either inline content or a Google Cloud
- // Storage uri.
-
- // The audio source, which is inline content.
- oneof audio_source {
- // The audio data bytes encoded as specified in
- // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
- // pure binary representation, whereas JSON representations use base64.
- bytes content = 1;
-
- // @exclude URI that points to a file that contains audio data bytes as specified in
- // `RecognitionConfig`. The file must not be compressed (for example, gzip).
- // Currently, only Google Cloud Storage URIs are
- // supported, which must be specified in the following format:
- // `gs://bucket_name/object_name` (other URI formats return
- // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
- // [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
-
- // [*Unsupported*]
- string uri = 2;
- }
-}
-
-// The only message returned to the client by the `Recognize` method. It
-// contains the result as zero or more sequential `SpeechRecognitionResult`
-// messages.
-message RecognizeResponse {
- // [*Output only*] Sequential list of transcription results corresponding to
- // sequential portions of audio.
- repeated SpeechRecognitionResult results = 2;
-}
-
-// // The only message returned to the client by the `LongRunningRecognize` method.
-// // It contains the result as zero or more sequential `SpeechRecognitionResult`
-// // messages. It is included in the `result.response` field of the `Operation`
-// // returned by the `GetOperation` call of the `google::longrunning::Operations`
-// // service.
-// message LongRunningRecognizeResponse {
-// // [*Output only*] Sequential list of transcription results corresponding to
-// // sequential portions of audio.
-// repeated SpeechRecognitionResult results = 2;
-// }
-
-// // Describes the progress of a long-running `LongRunningRecognize` call. It is
-// // included in the `metadata` field of the `Operation` returned by the
-// // `GetOperation` call of the `google::longrunning::Operations` service.
-// message LongRunningRecognizeMetadata {
-// // Approximate percentage of audio processed thus far. Guaranteed to be 100
-// // when the audio is fully processed and the results are available.
-// int32 progress_percent = 1;
-
-// // Time when the request was received.
-// google.protobuf.Timestamp start_time = 2;
-
-// // Time of the most recent processing update.
-// google.protobuf.Timestamp last_update_time = 3;
-// }
-
-// `StreamingRecognizeResponse` is the only message returned to the client by
-// `StreamingRecognize`. A series of one or more `StreamingRecognizeResponse`
-// messages are streamed back to the client.
-//
-// Here's an example of a series of ten `StreamingRecognizeResponse`s that might
-// be returned while processing audio:
-//
-// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
-//
-// 2. results { alternatives { transcript: "to be a" } stability: 0.01 }
-//
-// 3. results { alternatives { transcript: "to be" } stability: 0.9 }
-// results { alternatives { transcript: " or not to be" } stability: 0.01 }
-//
-// 4. results { alternatives { transcript: "to be or not to be"
-// confidence: 0.92 }
-// alternatives { transcript: "to bee or not to bee" }
-// is_final: true }
-//
-// 5. results { alternatives { transcript: " that's" } stability: 0.01 }
-//
-// 6. results { alternatives { transcript: " that is" } stability: 0.9 }
-// results { alternatives { transcript: " the question" } stability: 0.01 }
-//
-// 7. results { alternatives { transcript: " that is the question"
-// confidence: 0.98 }
-// alternatives { transcript: " that was the question" }
-// is_final: true }
-//
-// Notes:
-//
-// - Only two of the above responses #4 and #7 contain final results; they are
-// indicated by `is_final: true`. Concatenating these together generates the
-// full transcript: "to be or not to be that is the question".
-//
-// - The others contain interim `results`. #3 and #6 contain two interim
-// `results`: the first portion has a high stability and is less likely to
-// change; the second portion has a low stability and is very likely to
-// change. A UI designer might choose to show only high stability `results`.
-//
-// - The specific `stability` and `confidence` values shown above are only for
-// illustrative purposes. Actual values may vary.
-//
-// - In each response, only one of these fields will be set:
-// `error`,
-// `speech_event_type`, or
-// one or more (repeated) `results`.
-message StreamingRecognizeResponse {
- // Indicates the type of speech event.
- enum SpeechEventType {
- // No speech event specified.
- SPEECH_EVENT_UNSPECIFIED = 0;
-
- // This event indicates that the server has detected the end of the user's
- // speech utterance and expects no additional speech. Therefore, the server
- // will not process additional audio (although it may subsequently return
- // additional results). The client should stop sending additional audio
- // data, half-close the gRPC connection, and wait for any additional results
- // until the server closes the gRPC connection. This event is only sent if
- // `single_utterance` was set to `true`, and is not used otherwise.
- END_OF_SINGLE_UTTERANCE = 1;
- }
-
- // [*Output only*] If set, returns a [google.rpc.Status][google.rpc.Status] message that
- // specifies the error for the operation.
- google.rpc.Status error = 1;
-
- // [*Output only*] This repeated list contains zero or more results that
- // correspond to consecutive portions of the audio currently being processed.
- // It contains zero or one `is_final=true` result (the newly settled portion),
- // followed by zero or more `is_final=false` results (the interim results).
- repeated StreamingRecognitionResult results = 2;
-
- // [*Output only*] Indicates the type of speech event.
- SpeechEventType speech_event_type = 4;
-}
-
-// A streaming speech recognition result corresponding to a portion of the audio
-// that is currently being processed.
-message StreamingRecognitionResult {
- // [**Extension by Techmo**]
- // Indicates the cause of recognition result finalization. These are MRCPv2-related.
- // See [Completion-Cause](https://tools.ietf.org/html/rfc6787#section-9.4.11).
- enum ResultFinalizationCause {
- // No recognition result finalization cause specified.
- RESULT_FINALIZATION_CAUSE_UNSPECIFIED = 0;
-
- // Recognition has been finalized with a complete result
- // after specified length of silence after user speech.
- // See [Speech-Complete-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.15).
- SUCCESS = 1;
-
- // Recognition has started and there was no speech detected
- // for a certain period of time.
- // See [No-Input-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.6).
- NO_INPUT_TIMEOUT = 2;
-
- // Recognition has been finalized because speech was too long, with a complete result.
- // See [Recognition-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.7).
- SUCCESS_MAXTIME = 3;
-
- // Recognition has been finalized with an incomplete result
- // after specified length of silence after user speech.
- // See [Speech-Incomplete-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.16).
- PARTIAL_MATCH = 4;
-
- // Recognition has been finalized because speech was too long, with no result.
- // See [Recognition-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.7).
- NO_MATCH_MAXTIME = 5;
- }
-
- // [*Output only*] May contain one or more recognition hypotheses (up to the
- // maximum specified in `max_alternatives`).
- // These alternatives are ordered in terms of accuracy, with the top (first)
- // alternative being the most probable, as ranked by the recognizer.
- repeated SpeechRecognitionAlternative alternatives = 1;
-
- // [*Output only*] If `false`, this `StreamingRecognitionResult` represents an
- // interim result that may change. If `true`, this is the final time the
- // speech service will return this particular `StreamingRecognitionResult`,
- // the recognizer will not return any further hypotheses for this portion of
- // the transcript and corresponding audio.
- bool is_final = 2;
-
- // @exclude [*Output only*] An estimate of the likelihood that the recognizer will not
- // change its guess about this interim result. Values range from 0.0
- // (completely unstable) to 1.0 (completely stable).
- // This field is only provided for interim results (`is_final=false`).
- // The default of 0.0 is a sentinel value indicating `stability` was not set.
-
- // [*Unused*]
- float stability = 3;
-
- // [**Extension by Techmo**]
- // [*Output only*] Indicates the cause of recognition result finalization.
- ResultFinalizationCause result_finalization_cause = 4;
-
- // [**Extension by Techmo**]
- // [*Output only*] Detailed recognition result (lattice).
- // Returned only when requested (`ConfigField`: return-lattice=true in
- // `RecognitionConfig` Message), only for final (`is_final = true`) results,
- // and only when it's allowed by licence.
- // When requested and not allowed by licence, [google.rpc.Code.FAILED_PRECONDITION]
- // will be returned.
- repeated RecognitionLattice lattice = 5;
-
- // [**Extension by Techmo**]
- // [*Output only*] Predicted gender of the speaker
- Gender gender = 6;
-
- // [**Extension by Techmo**]
- // [*Output only*] Predicted age of the speaker
- Age age = 7;
-}
-
-// A speech recognition result corresponding to a portion of the audio.
-message SpeechRecognitionResult {
- // [*Output only*] May contain one or more recognition hypotheses (up to the
- // maximum specified in `max_alternatives`).
- // These alternatives are ordered in terms of accuracy, with the top (first)
- // alternative being the most probable, as ranked by the recognizer.
- repeated SpeechRecognitionAlternative alternatives = 1;
-
- // [**Extension by Techmo**]
- // [*Output only*] Detailed recognition result (lattice).
- // Returned only when requested (`ConfigField`: return-lattice=true in
- // `RecognitionConfig` Message), only for final (`is_final = true`) results,
- // and only when it's allowed by licence.
- // When requested and not allowed by licence, [google.rpc.Code.FAILED_PRECONDITION]
- // will be returned.
- repeated RecognitionLattice lattice = 5;
-
- // [**Extension by Techmo**]
- // [*Output only*] Predicted gender of the speaker
- Gender gender = 6;
-
- // [**Extension by Techmo**]
- // [*Output only*] Predicted age of the speaker
- Age age = 7;
-}
-
-// Alternative hypotheses (a.k.a. n-best list).
-message SpeechRecognitionAlternative {
- // [*Output only*] Transcript text representing the words that the user spoke.
- string transcript = 1;
-
- // [*Output only*] The confidence estimate between 0.0 and 1.0. A higher number
- // indicates an estimated greater likelihood that the recognized words are
- // correct.
- float confidence = 2;
-
- // @exclude This field is set only for the top alternative of a non-streaming
- // result or, of a streaming result where `is_final=true`.
- // This field is not guaranteed to be accurate and users should not rely on it
- // to be always provided.
- // The default of 0.0 is a sentinel value indicating `confidence` was not set.
-
- // [*Output only*] A list of word-specific information for each recognized word.
- repeated WordInfo words = 3;
-}
-
-// Word-specific information for recognized words. Word information is only
-// included in the response when certain request parameters are set, such
-// as `enable_word_time_offsets`.
-message WordInfo {
- // @exclude [*Output only*] Time offset relative to the beginning of the audio,
- // and corresponding to the start of the spoken word.
- // This field is only set if `enable_word_time_offsets=true` and only
- // in the top hypothesis.
- // This is an experimental feature and the accuracy of the time offset can
- // vary.
-
- // [*Output only*] Time offset relative to the beginning of the audio,
- // and corresponding to the start of the spoken word.
- // This field is only set if `enable_word_time_offsets=true`.
- google.protobuf.Duration start_time = 1;
-
- // @exclude [*Output only*] Time offset relative to the beginning of the audio,
- // and corresponding to the end of the spoken word.
- // This field is only set if `enable_word_time_offsets=true` and only
- // in the top hypothesis.
- // This is an experimental feature and the accuracy of the time offset can
- // vary.
-
- // [*Output only*] Time offset relative to the beginning of the audio,
- // and corresponding to the end of the spoken word.
- // This field is only set if `enable_word_time_offsets=true`.
- google.protobuf.Duration end_time = 2;
-
- // [*Output only*] The word corresponding to this set of information.
- string word = 3;
-}
-
-// [**Extension by Techmo**]
-// Detailed recognition result (lattice).
-// Returned only when requested (`ConfigField`: return-lattice=true in
-// `RecognitionConfig` Message), only for final (`is_final = true`) results,
-// and only when it's allowed by licence. When requested and not allowed by
-// licence, [google.rpc.Code.FAILED_PRECONDITION] will be returned.
-message RecognitionLattice {
- // List of final nodes.
- repeated int32 final_nodes = 1;
-
- // List of lattice edges.
- repeated LatticeEdge edges = 2;
-}
-
-// [**Extension by Techmo**]
-// Edge-specific information for recognition lattice.
-message LatticeEdge {
- // Input node ID, node '0' is starting node for the lattice.
- int32 start_node = 1;
-
- // End node ID.
- int32 end_node = 2;
-
- // Word.
- string symbol = 3;
-
- // Language model cost.
- float language_cost = 4;
-
- // Raw acoustic score (unscaled).
- float acoustic_cost = 5;
-
- // Word duration in milliseconds.
- int32 duration = 6;
-}
-
-// [**Extension by Techmo**]
-// Predicted gender of the speaker
-message Gender {
- // The recognized gender label.
- string gender = 1;
-
- // The confidence in [0, 1] range, where near 0 means 'unsure' and near 1 means 'almost certain'.
- float confidence = 2;
-}
-
-// [**Extension by Techmo**]
-// Predicted age of the speaker
-message Age {
- // The recognized age, in years.
- int32 age = 1;
-
- // The confidence in [0, 1] range, where near 0 means 'unsure' and near 1 means 'almost certain'.
- float confidence = 2;
-}
diff --git a/proto/techmo/asr/api/v1/asr.proto b/proto/techmo/asr/api/v1/asr.proto
deleted file mode 100644
index a83f414..0000000
--- a/proto/techmo/asr/api/v1/asr.proto
+++ /dev/null
@@ -1,436 +0,0 @@
-// Copyright 2023 Techmo sp. z o.o.
-
-syntax = "proto3";
-
-package techmo.asr.api.v1;
-
-import "google/protobuf/duration.proto";
-import "techmo/api/status.proto";
-
-
-// An automatic speech recognition (ASR) service providing a solution for
-// speech-to-text conversion extended by the assessment of additional speech
-// and speaker features.
-service Asr {
- // Perform bidirectional streaming recognition.
- rpc StreamingRecognize(stream StreamingRecognizeRequest)
- returns (stream StreamingRecognizeResponse) {}
-}
-
-// A message streamed from the client through
-// the [`StreamingRecognize`](#StreamingRecognize) method.
-message StreamingRecognizeRequest {
- oneof request_content {
- // The immutable initial configuration of the request.
- // Must be sent once in the request's first message.
- StreamingRecognizeRequestConfig config = 1;
-
- // The message controlling the processing flow of the request.
- // May be sent multiple times except in the request's first message.
- StreamingRecognizeRequestControlMessage control_message = 2;
-
- // The data contents of the request itself.
- // May be sent multiple times except in the request's first message.
- StreamingRecognizeRequestData data = 3;
- }
-}
-
-// A message holding configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingRecognizeRequestConfig {
- // Part of the configuration for the request's audio content.
- AudioConfig audio_config = 1;
-
- // Part of the configuration for the request's result form.
- ResultConfig result_config = 2;
-
- // Part of the configuration for the request's processing flow.
- StreamingConfig streaming_config = 3;
-
- // Part of the configuration for speech recognition.
- SpeechRecognitionConfig speech_recognition_config = 4;
-
- // Part of the configuration for age recognition.
- AgeRecognitionConfig age_recognition_config = 5;
-
- // Part of the configuration for gender recognition.
- GenderRecognitionConfig gender_recognition_config = 6;
-
- // Part of the configuration for language recognition.
- LanguageRecognitionConfig language_recognition_config = 7;
-}
-
-// Result configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message ResultConfig {
- // The switch that toggles continuous recognition into single utterance mode.
- // The service returns a final result for each end of utterance it detects in
- // the audio, which may occur multiple times during a request.
- // If enabled, the request terminates right after its first final result.
- bool enable_single_utterance = 1;
-
- // The switch that allows interim results.
- // If enabled, results containing tentative hypotheses may be returned in
- // addition to final ones.
- // The service should silently ignore this field if it is unsupported.
- bool enable_interim_results = 2;
-}
-
-// Streaming configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingConfig {
- reserved 1; // bool enable_single_utterance = 1;
-
- // The switch that enables manual control of the input timer.
- // The timer imposes two constraints: one that finalizes recognition after
- // a specified period unless speech is detected, and the other that limits
- // the total time for an utterance. Manual control allows recognition to
- // begin but delays enforcement of these constraints. The timer restarts
- // after each detected end of utterance (each final result).
- // If enabled, the timer does not start automatically. Instead, it can be
- // initiated by sending
- // a [`StreamingRecognizeRequestControlMessage`](#StreamingRecognizeRequestControlMessage)
- // with the `start_input_timer` field set to `true` as needed. This should
- // occur after the beginning of the request and be repeated after each final
- // result.
- bool enable_manual_input_timer = 2;
-}
-
-// Audio configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message AudioConfig {
- // The possible audio encodings.
- enum AudioEncoding {
- // Unspecified audio encoding.
- UNSPECIFIED = 0;
-
- // Linear pulse-code modulation of uncompressed 16-bit signed little-endian
- // samples.
- LINEAR16 = 1;
-
- // Free Lossless Audio Codec ([FLAC](https://wiki.xiph.org/FLAC)).
- // The encoding requires only about half the bandwidth of `LINEAR16`.
- // 16-bit and 24-bit samples. Not all fields in `STREAMINFO` are supported.
- // When set, the service ignores the `sampling_rate_hz` field and detects
- // the actual value from audio header instead.
- FLAC = 2;
-
- // Ogg Encapsulated Opus Audio Codec ([OggOpus](https://wiki.xiph.org/OggOpus)).
- // When set, the service ignores the `sampling_rate_hz` field and detects
- // the actual value from audio header instead.
- OGG_OPUS = 6;
-
- // MP3 (ISO/IEC 11172-3 and ISO/IEC 13818-3).
- // Only constant bitrate.
- // When set, the service ignores the `sampling_rate_hz` field and detects
- // the actual value from audio header instead.
- MP3 = 8;
- }
-
- // The encoding of the audio data sent in the request. Single channel (mono)
- // audio is assumed.
- // The service should respond with the `INVALID_ARGUMENT` gRPC status code
- // if the encoding is `UNSPECIFIED`.
- // The service should respond with the `FAILED_PRECONDITION` gRPC status code
- // if the encoding is not supported.
- AudioEncoding encoding = 1;
-
- // The sampling rate of the audio data sent in the request.
- // The service should silently ignore the field for encodings that are sent
- // along wtih headers, and detect the value from them instead.
- // The service should respond with the `INVALID_ARGUMENT` gRPC status code
- // if the value is not greater than 0.
- float sampling_rate_hz = 2;
-}
-
-// Configuration of age recognition.
-message AgeRecognitionConfig {
- // The switch that enables age recognition for the request.
- // If disabled or unspecified, the related results are excluded.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_age_recognition = 1;
-}
-
-// Configuration of gender recognition.
-message GenderRecognitionConfig {
- // The switch that enables gender recognition for the request.
- // If disabled or unspecified, the related results are excluded.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_gender_recognition = 1;
-}
-
-// Configuration of language recognition.
-message LanguageRecognitionConfig {
- // The switch that enables language recognition for the request.
- // If disabled or unspecified, the related results are excluded.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_language_recognition = 1;
-}
-
-// Configuration for speech recognition.
-message SpeechRecognitionConfig {
- // The switch that enables speech recognition for the request.
- // If disabled or unspecified, the related results are excluded.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_speech_recognition = 1;
-
- // The maximum number of alternative transcriptions allowed to be included
- // per response.
- // The actual count received can be less than the specified value and may
- // also be equal to 0. If unspecified or 0, one alternative is allowed to be
- // returned too.
- uint32 recognition_alternatives_limit = 2;
-
- // The switch that enables additional time alignment of recognitions in word
- // details.
- // If enabled, the `words` field of
- // a [`SpeechRecognitionAlternative`](#SpeechRecognitionAlternative) message
- // includes a list of [`SpeechRecognitionWord`](#SpeechRecognitionWord)
- // messages. Otherwise, it remains empty.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_time_alignment = 3;
-
- // The name of a language group of models to be used.
- // If left unspecified, it backs to the service's default group.
- // The service responds with the `NOT_FOUND` gRPC status code
- // if the name is not registered.
- string language_group_name = 4;
-
- // The name of a model to be used.
- // If left unspecified, it backs to the selected langugage group's default.
- // The service responds with the `NOT_FOUND` gRPC status code
- // if the name is not registered.
- string model_name = 5;
-
- // Deprecated.
- // The additional advanced service-dependend configuration for its speech
- // recognizer. It may be silently ignored.
- map config_fields = 6;
-}
-
-// A message controlling the processing flow of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingRecognizeRequestControlMessage {
- // The flag that starts the input timer on demand and resets after each final
- // result. It is silently ignored if the manual input timer setting is
- // disabled for the request.
- optional bool start_input_timer = 1;
-}
-
-// A message that carries data contents of
-// a [`StreamingRecognizeRequest`](#StreamingRecognizeRequest) request.
-message StreamingRecognizeRequestData {
- // Part of the audio to perform recognition on.
- Audio audio = 1;
-}
-
-// Audio contents.
-message Audio {
- oneof audio_content {
- // The audio data bytes.
- bytes bytes = 1;
- }
-}
-
-// A message streamed from the service through
-// the [`StreamingRecognize`](#StreamingRecognize) method.
-message StreamingRecognizeResponse {
- // The combined recognition results for another part of the audio.
- StreamingRecognizeResult result = 1;
-
- // The cumulative duration of the processed audio during the request,
- // not necessarily matching the actual length of the sent audio, mandatorily
- // updated with each final result.
- google.protobuf.Duration processed_audio_duration = 2;
-}
-
-// Combined recognition result.
-message StreamingRecognizeResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The flag indicating whether the result is interim or final.
- bool is_final = 2;
-
- // The anticipated causes for the service to finalize a result.
- enum ResultFinalizationCause {
- // The cause is not specified.
- UNSPECIFIED = 0;
-
- // The speech recognition result is not empty and the end of utterance
- // is detected.
- SUCCESS = 1;
-
- // The speech recognition result is empty after the duration to expect
- // a result is reached.
- NO_INPUT_TIMEOUT = 2;
-
- // The speech recognition result is not empty after the utterance duration
- // limit is reached. The returned speech recognition is incomplete and
- // should be completed in the following result.
- SUCCESS_MAXTIME = 3;
-
- // Unused.
- PARTIAL_MATCH = 4;
-
- // The speech recognition result is empty after the utterance duration
- // limit is reached.
- NO_MATCH_MAXTIME = 5;
- }
-
- // The field indicating the cause of result finalization.
- // For interim results, the service should leave the field as `UNSPECIFIED`.
- // For final results, the service must set the field to a value other than
- // `UNSPECIFIED`.
- ResultFinalizationCause result_finalization_cause = 3;
-
- // The speech recognition result for another part of the processed audio,
- // new with each final result, updates with each interim one.
- // To obtain a complete result for all processed audio, for each final result
- // received, a client should pick one of the result's recognition alternatives
- // and buffer it on its own.
- // It must be omitted if speech recognition is disabled.
- SpeechRecognitionResult speech_recognition_result = 4;
-
- // The current age recognition result for all processed audio,
- // updated with each final result.
- // It may be omitted in an interim result and must be omitted if age
- // recognition is disabled.
- AgeRecognitionResult age_recognition_result = 5;
-
- // The current gender recognition result for all processed audio,
- // updated with each final result.
- // It may be omitted in an interim result and must be omitted if gender
- // recognition is disabled.
- GenderRecognitionResult gender_recognition_result = 6;
-
- // The current language recognition result for all processed audio,
- // updated with each final result.
- // It may be omitted in an interim result and must be omitted if language
- // recognition is disabled.
- LanguageRecognitionResult language_recognition_result = 7;
-}
-
-// A result of age recognition.
-message AgeRecognitionResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The confidence-ordered list of alternative recognition hypotheses.
- repeated AgeRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of age recognition.
-message AgeRecognitionAlternative {
- // The assumed age of the person speaking in the audio, in years.
- // For a reliable value, assure that there is only one person speaking in
- // the audio.
- uint32 age = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-}
-
-// A result of gender recognition.
-message GenderRecognitionResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The confidence-ordered list of alternative recognition hypotheses.
- repeated GenderRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of gender recognition.
-message GenderRecognitionAlternative {
- // The assumed gender of the person speaking in the audio.
- // For a reliable value, assure that there is only one person speaking in
- // the audio.
- string gender = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-}
-
-// A result of language recognition.
-message LanguageRecognitionResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The confidence-ordered list of alternative recognition hypotheses.
- repeated LanguageRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of language recognition.
-message LanguageRecognitionAlternative {
- // The language spoken in the audio,
- // a [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) tag.
- string language = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-}
-
-// A result of speech recognition.
-message SpeechRecognitionResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The confidence-ordered list of alternative recognition hypotheses.
- repeated SpeechRecognitionAlternative recognition_alternatives = 2;
-
- // The actual name of the language group of the model,
- // unrelated to the actual language spoken in the audio.
- string language_group_name = 3;
-
- // The actual name of the model used to obtain the result.
- string model_name = 4;
-}
-
-// An alternative hypothesis of speech recognition.
-message SpeechRecognitionAlternative {
- // The transcript of the audio.
- string transcript = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-
- // The details of the transcript's words.
- // Empty unless `enable_time_alignment` is `true` in the request's
- // [`SpeechRecognitionConfig`](#SpeechRecognitionConfig).
- repeated SpeechRecognitionWord words = 3;
-}
-
-// Details of a single word in speech recognition.
-message SpeechRecognitionWord {
- // The transcript of the word itself.
- string transcript = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-
- // The start time of the word relative to the beginning of the entire audio.
- google.protobuf.Duration start_time = 3;
-
- // The end time of the word relative to the beginning of the entire audio.
- google.protobuf.Duration end_time = 4;
-}
diff --git a/proto/techmo/asr/api/v1p1/asr.proto b/proto/techmo/asr/api/v1p1/asr.proto
deleted file mode 100644
index f2af15a..0000000
--- a/proto/techmo/asr/api/v1p1/asr.proto
+++ /dev/null
@@ -1,481 +0,0 @@
-// Copyright 2023 Techmo sp. z o.o.
-
-syntax = "proto3";
-
-package techmo.asr.api.v1p1;
-
-import "google/protobuf/duration.proto";
-import "techmo/api/status.proto";
-
-
-// An automatic speech recognition (ASR) service providing a solution for
-// speech-to-text conversion extended by the assessment of additional speech
-// and speaker features.
-service Asr {
- // Perform bidirectional streaming recognition.
- rpc StreamingRecognize(stream StreamingRecognizeRequest)
- returns (stream StreamingRecognizeResponse) {}
-}
-
-// A message streamed from the client through
-// the [`StreamingRecognize`](#StreamingRecognize) method.
-message StreamingRecognizeRequest {
- oneof request_content {
- // The immutable initial configuration of the request.
- // Must be sent once in the request's first message.
- StreamingRecognizeRequestConfig config = 1;
-
- // The message controlling the processing flow of the request.
- // May be sent multiple times except in the request's first message.
- StreamingRecognizeRequestControlMessage control_message = 2;
-
- // The data contents of the request itself.
- // May be sent multiple times except in the request's first message.
- StreamingRecognizeRequestData data = 3;
- }
-}
-
-// A message holding configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingRecognizeRequestConfig {
- // Part of the configuration for the request's audio content.
- AudioConfig audio_config = 1;
-
- // Part of the configuration for the request's result form.
- ResultConfig result_config = 2;
-
- // Part of the configuration for the request's processing flow.
- StreamingConfig streaming_config = 3;
-
- // Part of the configuration for speech recognition.
- SpeechRecognitionConfig speech_recognition_config = 4;
-
- // Part of the configuration for age recognition.
- AgeRecognitionConfig age_recognition_config = 5;
-
- // Part of the configuration for gender recognition.
- GenderRecognitionConfig gender_recognition_config = 6;
-
- // Part of the configuration for language recognition.
- LanguageRecognitionConfig language_recognition_config = 7;
-}
-
-// Result configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message ResultConfig {
- // The switch that toggles continuous recognition into single utterance mode.
- // The service returns a final result for each end of utterance it detects in
- // the audio, which may occur multiple times during a request.
- // If enabled, the request terminates right after its first final result.
- bool enable_single_utterance = 1;
-
- // The switch that allows interim results.
- // If enabled, results containing tentative hypotheses may be returned in
- // addition to final ones.
- // The service should silently ignore this field if it is unsupported.
- bool enable_interim_results = 2;
-
- // The switch to allow the service merging responses in the "hold response"
- // state.
- // If enabled and there is more than a single response held, the service does
- // not return them in a batch. Instead, it tries to merge their results into
- // a single response.
- // The service should respond with the `INVALID_ARGUMENT` gRPC status code
- // if the `recognition_alternatives_limit` field
- // of the [`SpeechRecognitionConfig`](#SpeechRecognitionConfig) message is
- // greater than 1.
- // New in v1p1.
- bool enable_held_responses_merging = 3;
-}
-
-// Streaming configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingConfig {
- reserved 1; // bool enable_single_utterance = 1;
-
- // The switch that enables manual control of the input timer.
- // The timer imposes two constraints: one that finalizes recognition after
- // a specified period unless speech is detected, and the other that limits
- // the total time for an utterance. Manual control allows recognition to
- // begin but delays enforcement of these constraints. The timer restarts
- // after each detected end of utterance (each final result).
- // If enabled, the timer does not start automatically. Instead, it can be
- // initiated by sending
- // a [`StreamingRecognizeRequestControlMessage`](#StreamingRecognizeRequestControlMessage)
- // with the `start_input_timer` field set to `true` as needed. This should
- // occur after the beginning of the request and be repeated after each final
- // result.
- bool enable_manual_input_timer = 2;
-
- // The switch to automatically set the service in the "hold response" state
- // at the beginning of the request and after each final result.
- // The "hold response" state means that the internal recognition process
- // continues, but results are kept, not returned. When needed, the state can
- // be toggled into the "give response" state by sending
- // the [`StreamingRecognizeRequestControlMessage`](#StreamingRecognizeRequestControlMessage)
- // message with the `give_response` field set to `true`.
- // In the "give response" state the service responds as soon as it is ready.
- // Any held responses may be returned in a batch or as a single merged
- // response, provided that the `enable_held_responses_merging` field
- // of the [`ResultConfig`](#ResultConfig) message is set to `true`.
- // New in v1p1.
- bool enable_auto_hold_response = 3;
-}
-
-// Audio configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message AudioConfig {
- // The possible audio encodings.
- enum AudioEncoding {
- // Unspecified audio encoding.
- UNSPECIFIED = 0;
-
- // Linear pulse-code modulation of uncompressed 16-bit signed little-endian
- // samples.
- LINEAR16 = 1;
-
- // Free Lossless Audio Codec ([FLAC](https://wiki.xiph.org/FLAC)).
- // The encoding requires only about half the bandwidth of `LINEAR16`.
- // 16-bit and 24-bit samples. Not all fields in `STREAMINFO` are supported.
- // When set, the service ignores the `sampling_rate_hz` field and detects
- // the actual value from audio header instead.
- FLAC = 2;
-
- // Ogg Encapsulated Opus Audio Codec ([OggOpus](https://wiki.xiph.org/OggOpus)).
- // When set, the service ignores the `sampling_rate_hz` field and detects
- // the actual value from audio header instead.
- OGG_OPUS = 6;
-
- // MP3 (ISO/IEC 11172-3 and ISO/IEC 13818-3).
- // Only constant bitrate.
- // When set, the service ignores the `sampling_rate_hz` field and detects
- // the actual value from audio header instead.
- MP3 = 8;
- }
-
- // The encoding of the audio data sent in the request. Single channel (mono)
- // audio is assumed.
- // The service should respond with the `INVALID_ARGUMENT` gRPC status code
- // if the encoding is `UNSPECIFIED`.
- // The service should respond with the `FAILED_PRECONDITION` gRPC status code
- // if the encoding is not supported.
- AudioEncoding encoding = 1;
-
- // The sampling rate of the audio data sent in the request.
- // The service should silently ignore the field for encodings that are sent
- // along wtih headers, and detect the value from them instead.
- // The service should respond with the `INVALID_ARGUMENT` gRPC status code
- // if the value is not greater than 0.
- float sampling_rate_hz = 2;
-}
-
-// Configuration of age recognition.
-message AgeRecognitionConfig {
- // The switch that enables age recognition for the request.
- // If disabled or unspecified, the related results are excluded.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_age_recognition = 1;
-}
-
-// Configuration of gender recognition.
-message GenderRecognitionConfig {
- // The switch that enables gender recognition for the request.
- // If disabled or unspecified, the related results are excluded.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_gender_recognition = 1;
-}
-
-// Configuration of language recognition.
-message LanguageRecognitionConfig {
- // The switch that enables language recognition for the request.
- // If disabled or unspecified, the related results are excluded.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_language_recognition = 1;
-}
-
-// Configuration for speech recognition.
-message SpeechRecognitionConfig {
- // The switch that enables speech recognition for the request.
- // If disabled or unspecified, the related results are excluded.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_speech_recognition = 1;
-
- // The maximum number of alternative transcriptions allowed to be included
- // per response.
- // The actual count received can be less than the specified value and may
- // also be equal to 0. If unspecified or 0, one alternative is allowed to be
- // returned too.
- uint32 recognition_alternatives_limit = 2;
-
- // The switch that enables additional time alignment of recognitions in word
- // details.
- // If enabled, the `words` field of
- // a [`SpeechRecognitionAlternative`](#SpeechRecognitionAlternative) message
- // includes a list of [`SpeechRecognitionWord`](#SpeechRecognitionWord)
- // messages. Otherwise, it remains empty.
- // The service responds with the `FAILED_PRECONDITION` gRPC status code
- // if requested but not enabled.
- bool enable_time_alignment = 3;
-
- // The name of a language group of models to be used.
- // If left unspecified, it backs to the service's default group.
- // The service responds with the `NOT_FOUND` gRPC status code
- // if the name is not registered.
- string language_group_name = 4;
-
- // The name of a model to be used.
- // If left unspecified, it backs to the selected langugage group's default.
- // The service responds with the `NOT_FOUND` gRPC status code
- // if the name is not registered.
- string model_name = 5;
-
- // Deprecated.
- // The additional advanced service-dependend configuration for its speech
- // recognizer. It may be silently ignored.
- map config_fields = 6;
-}
-
-// A message controlling the processing flow of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingRecognizeRequestControlMessage {
- reserved 2;
-
- oneof control_message_content
- {
- // The flag that starts the input timer on demand and resets after each final
- // result. It is silently ignored if the manual input timer setting is
- // disabled for the request.
- bool start_input_timer = 1;
-
- // The flag to allow the service to return a response.
- // After receiving this message, the service remains in the "give response"
- // state. Ignored when the service is already in the "give response" state.
- // Mutually exclusive with the `hold_response` field.
- // New in v1p1.
- bool give_response = 3;
-
- // The flag to forbid the service from returning a response.
- // After receiving this message, the service remains in the "hold response"
- // state. Ignored when the service is already in the "hold response" state.
- // Mutually exclusive with the `give_response` field.
- // New in v1p1.
- bool hold_response = 4;
- }
-}
-
-// A message that carries data contents of
-// a [`StreamingRecognizeRequest`](#StreamingRecognizeRequest) request.
-message StreamingRecognizeRequestData {
- // Part of the audio to perform recognition on.
- Audio audio = 1;
-}
-
-// Audio contents.
-message Audio {
- oneof audio_content {
- // The audio data bytes.
- bytes bytes = 1;
- }
-}
-
-// A message streamed from the service through
-// the [`StreamingRecognize`](#StreamingRecognize) method.
-message StreamingRecognizeResponse {
- // The combined recognition results for another part of the audio.
- StreamingRecognizeResult result = 1;
-
- // The cumulative duration of the processed audio during the request,
- // not necessarily matching the actual length of the sent audio, mandatorily
- // updated with each final result.
- google.protobuf.Duration processed_audio_duration = 2;
-}
-
-// Combined recognition result.
-message StreamingRecognizeResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The flag indicating whether the result is interim or final.
- bool is_final = 2;
-
- // The anticipated causes for the service to finalize a result.
- enum ResultFinalizationCause {
- // The cause is not specified.
- UNSPECIFIED = 0;
-
- // The speech recognition result is not empty and the end of utterance
- // is detected.
- SUCCESS = 1;
-
- // The speech recognition result is empty after the duration to expect
- // a result is reached.
- NO_INPUT_TIMEOUT = 2;
-
- // The speech recognition result is not empty after the utterance duration
- // limit is reached. The returned speech recognition is incomplete and
- // should be completed in the following result.
- SUCCESS_MAXTIME = 3;
-
- // Unused.
- PARTIAL_MATCH = 4;
-
- // The speech recognition result is empty after the utterance duration
- // limit is reached.
- NO_MATCH_MAXTIME = 5;
- }
-
- // The field indicating the cause of result finalization.
- // For interim results, the service should leave the field as `UNSPECIFIED`.
- // For final results, the service must set the field to a value other than
- // `UNSPECIFIED`.
- ResultFinalizationCause result_finalization_cause = 3;
-
- // The speech recognition result for another part of the processed audio,
- // new with each final result, updates with each interim one.
- // To obtain a complete result for all processed audio, for each final result
- // received, a client should pick one of the result's recognition alternatives
- // and buffer it on its own.
- // It must be omitted if speech recognition is disabled.
- SpeechRecognitionResult speech_recognition_result = 4;
-
- // The current age recognition result for all processed audio,
- // updated with each final result.
- // It may be omitted in an interim result and must be omitted if age
- // recognition is disabled.
- AgeRecognitionResult age_recognition_result = 5;
-
- // The current gender recognition result for all processed audio,
- // updated with each final result.
- // It may be omitted in an interim result and must be omitted if gender
- // recognition is disabled.
- GenderRecognitionResult gender_recognition_result = 6;
-
- // The current language recognition result for all processed audio,
- // updated with each final result.
- // It may be omitted in an interim result and must be omitted if language
- // recognition is disabled.
- LanguageRecognitionResult language_recognition_result = 7;
-}
-
-// A result of age recognition.
-message AgeRecognitionResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The confidence-ordered list of alternative recognition hypotheses.
- repeated AgeRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of age recognition.
-message AgeRecognitionAlternative {
- // The assumed age of the person speaking in the audio, in years.
- // For a reliable value, assure that there is only one person speaking in
- // the audio.
- uint32 age = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-}
-
-// A result of gender recognition.
-message GenderRecognitionResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The confidence-ordered list of alternative recognition hypotheses.
- repeated GenderRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of gender recognition.
-message GenderRecognitionAlternative {
- // The assumed gender of the person speaking in the audio.
- // For a reliable value, assure that there is only one person speaking in
- // the audio.
- string gender = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-}
-
-// A result of language recognition.
-message LanguageRecognitionResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The confidence-ordered list of alternative recognition hypotheses.
- repeated LanguageRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of language recognition.
-message LanguageRecognitionAlternative {
- // The language spoken in the audio,
- // a [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) tag.
- string language = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-}
-
-// A result of speech recognition.
-message SpeechRecognitionResult {
- // The recognition process status.
- // It may communicate warnings. In case of an error hindering recognition,
- // all other message fields should be left unset.
- techmo.api.Status error = 1;
-
- // The confidence-ordered list of alternative recognition hypotheses.
- repeated SpeechRecognitionAlternative recognition_alternatives = 2;
-
- // The actual name of the language group of the model,
- // unrelated to the actual language spoken in the audio.
- string language_group_name = 3;
-
- // The actual name of the model used to obtain the result.
- string model_name = 4;
-}
-
-// An alternative hypothesis of speech recognition.
-message SpeechRecognitionAlternative {
- // The transcript of the audio.
- string transcript = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-
- // The details of the transcript's words.
- // Empty unless `enable_time_alignment` is `true` in the request's
- // [`SpeechRecognitionConfig`](#SpeechRecognitionConfig).
- repeated SpeechRecognitionWord words = 3;
-}
-
-// Details of a single word in speech recognition.
-message SpeechRecognitionWord {
- // The transcript of the word itself.
- string transcript = 1;
-
- // The confidence estimate, ranging from 0.0 to 1.0.
- // Support for this feature is optional.
- optional float confidence = 2;
-
- // The start time of the word relative to the beginning of the entire audio.
- google.protobuf.Duration start_time = 3;
-
- // The end time of the word relative to the beginning of the entire audio.
- google.protobuf.Duration end_time = 4;
-}
diff --git a/pyproject.toml b/pyproject.toml
index 873297f..3e5c133 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,15 +1,23 @@
[build-system]
-requires = ["grpcio-tools>=1.49.4,<1.63", "setuptools>=61"]
+requires = ["grpcio-tools>=1.49.4,<1.71.0", "setuptools>=61"]
build-backend = "setuptools.build_meta"
[project]
name = "techmo-asr-api"
-description = "Techmo ASR API (public)"
+description = "Techmo ASR API"
dynamic = ["version"]
readme = { file = "README.md", content-type = "text/markdown" }
authors = [{ name = "Techmo sp. z o.o", email = "kontakt@techmo.pl" }]
requires-python = ">=3.8"
-dependencies = ["grpcio>=1.49.4,<1.63", "protobuf>=4.21.3,<5"]
+dependencies = [
+ "grpcio>=1.49.4,<1.71.0; python_version=='3.8'",
+ "grpcio>=1.49.4; python_version>='3.9'",
+ "protobuf>=4.21.3,<6.0.0; python_version=='3.8'",
+ "protobuf>=4.21.3; python_version>='3.9'",
+]
+
+[project.optional-dependencies]
+tests = ["pytest<8,>=7.4.4", "pytest-cov>=4.1", "pytest-lazy-fixture>=0.6.3"]
[project.urls]
repository = "https://github.com/techmo-pl/asr-api-python"
@@ -19,3 +27,28 @@ version = { attr = "asr_api.VERSION.__version__" }
[tool.setuptools.packages.find]
include = ["asr_api*", "google*", "techmo*"]
+
+[tool.pytest.ini_options]
+addopts = ["--strict-markers"]
+markers = ["""api(name): mark tests as defined for API. \
+ Example: api('techmo.asr.api.v1p1'). \
+ Use the `--api=` option to collect the marked tests."""]
+testpaths = ["tests"]
+
+[tool.coverage.report]
+precision = 1
+show_missing = true
+
+[tool.mypy]
+
+[[tool.mypy.overrides]]
+module = ["techmo.*", "google.*"]
+ignore_errors = true
+
+[[tool.mypy.overrides]]
+module = "tests.*"
+disallow_untyped_decorators = false
+
+[tool.coverage.run]
+source_pkgs = ["asr_api"]
+relative_files = true
diff --git a/setup.py b/setup.py
index 196ac3d..2ca8195 100644
--- a/setup.py
+++ b/setup.py
@@ -1,71 +1,90 @@
+import os
from pathlib import Path
-from typing import List
+from typing import Any, Optional, Sequence, Union
import setuptools
+_PathLike = Union[str, bytes, "os.PathLike[Any]"]
+_PathLikes = Sequence[_PathLike]
-def protoc(args: List[str]):
- import pkg_resources
+
+def _update_submodule(
+ submodule_path: _PathLike,
+ git_submodule_update_options: Sequence[str] = ("--init", "--depth", "1", "--"),
+ working_directory_path: Optional[_PathLike] = None,
+) -> None:
+ import subprocess
+
+ if (Path(str(working_directory_path) if working_directory_path else ".") / str(submodule_path) / ".git").exists():
+ return
+
+ if (
+ subprocess.call(
+ command := (("git", "submodule", "update") + tuple(git_submodule_update_options) + (str(submodule_path),)),
+ cwd=working_directory_path,
+ )
+ != 0
+ ):
+ raise Exception(f"error: {command} failed")
+
+
+def _protoc(*args: str) -> None:
+ import grpc_tools
from grpc_tools import protoc
- command = [
- "grpc_tools.protoc",
- "--proto_path={}".format(
- Path(pkg_resources.resource_filename("grpc_tools", "_proto"))
- ),
- ] + args
-
- if protoc.main(command) != 0:
- raise Exception("error: {} failed".format(command))
-
-
-def build_package_grpc_protos(
- protos_paths: List[Path], import_directory_paths: List[Path] = []
-):
- protoc(
- [
- "--proto_path={}".format(Path(import_directory_path))
- for import_directory_path in import_directory_paths
- ]
- + ["--grpc_python_out=."]
- + protos_paths,
+ if (
+ protoc.main(
+ command := (
+ "grpc_tools.protoc",
+ "--proto_path={}".format(Path(grpc_tools.__file__).parent / "_proto"),
+ )
+ + args
+ )
+ != 0
+ ):
+ raise Exception(f"error: {command} failed")
+
+
+def _build_package_grpc_protos(
+ proto_paths: _PathLikes,
+ import_directory_paths: Optional[_PathLikes] = None,
+) -> None:
+ _protoc(
+ *(f"--proto_path={str(import_directory_path)}" for import_directory_path in import_directory_paths or ()),
+ "--grpc_python_out=.",
+ *(str(proto_path) for proto_path in proto_paths),
)
-def build_package_protos(
- protos_paths: List[Path], import_directory_paths: List[Path] = []
-):
- protoc(
- [
- "--proto_path={}".format(Path(import_directory_path))
- for import_directory_path in import_directory_paths
- ]
- + ["--python_out=."]
- + protos_paths,
+def _build_package_protos(
+ proto_paths: _PathLikes,
+ import_directory_paths: Optional[_PathLikes] = None,
+) -> None:
+ _protoc(
+ *(f"--proto_path={str(import_directory_path)}" for import_directory_path in import_directory_paths or ()),
+ "--python_out=.",
+ *(str(proto_path) for proto_path in proto_paths),
)
-build_package_protos(
- protos_paths=[
- "./proto/google/rpc/status.proto",
- "./proto/techmo/api/status.proto",
- "./proto/techmo/asr/api/dictation/asr.proto",
- "./proto/techmo/asr/api/v1/asr.proto",
- "./proto/techmo/asr/api/v1p1/asr.proto",
- ],
- import_directory_paths=[
- "./proto",
- ],
+_update_submodule("./submodules/asr-api")
+_build_package_protos(
+ (
+ "./submodules/asr-api/proto/google/rpc/status.proto",
+ "./submodules/asr-api/proto/techmo/api/status.proto",
+ "./submodules/asr-api/proto/techmo/asr/api/dictation/asr.proto",
+ "./submodules/asr-api/proto/techmo/asr/api/v1/asr.proto",
+ "./submodules/asr-api/proto/techmo/asr/api/v1p1/asr.proto",
+ ),
+ import_directory_paths=("./submodules/asr-api/proto",),
)
-build_package_grpc_protos(
- protos_paths=[
- "./proto/techmo/asr/api/dictation/asr.proto",
- "./proto/techmo/asr/api/v1/asr.proto",
- "./proto/techmo/asr/api/v1p1/asr.proto",
- ],
- import_directory_paths=[
- "./proto",
- ],
+_build_package_grpc_protos(
+ (
+ "./submodules/asr-api/proto/techmo/asr/api/dictation/asr.proto",
+ "./submodules/asr-api/proto/techmo/asr/api/v1/asr.proto",
+ "./submodules/asr-api/proto/techmo/asr/api/v1p1/asr.proto",
+ ),
+ import_directory_paths=("./submodules/asr-api/proto",),
)
setuptools.setup()
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..f319aa5
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#
+# usage: ./setup.sh
+#
+# Run once after cloning: initialises git submodules.
+
+set -euo pipefail
+
+git submodule sync --recursive
+git submodule update --init --recursive
diff --git a/submodules/asr-api b/submodules/asr-api
new file mode 160000
index 0000000..084c836
--- /dev/null
+++ b/submodules/asr-api
@@ -0,0 +1 @@
+Subproject commit 084c836bff448aff140dd2391499a297aacabc4f
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..7650569
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,22 @@
+from typing import List
+
+import pytest
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+ parser.addoption(
+ "--api",
+ default=None,
+ choices=[
+ "techmo.asr.api.dictation",
+ "techmo.asr.api.v1",
+ "techmo.asr.api.v1p1",
+ ],
+ help="the argument of tests marked with the `@pytest.mark.api(name)` marker to be collected; one of: %(choices)s (default is %(default)r)",
+ metavar="name",
+ )
+
+
+def pytest_collection_modifyitems(config: pytest.Config, items: List[pytest.Item]) -> None:
+ if api := config.getoption("--api"):
+ items[:] = (item for item in items if (mark := item.get_closest_marker("api")) and mark.args and mark.args[0] == api)
diff --git a/tests/test_asr_api.py b/tests/test_asr_api.py
new file mode 100644
index 0000000..2dcf141
--- /dev/null
+++ b/tests/test_asr_api.py
@@ -0,0 +1,140 @@
+from typing import cast
+
+import pytest
+
+
+@pytest.fixture
+def asr_api_dictation() -> object:
+ import asr_api.dictation
+
+ return asr_api.dictation
+
+
+@pytest.fixture(
+ params=(
+ "Speech",
+ "RecognizeRequest",
+ "StreamingRecognizeRequest",
+ "StreamingRecognitionConfig",
+ "RecognitionConfig",
+ "SpeechDurationConfig",
+ "SpeechDurationThresholdMode",
+ "SpeechContext",
+ "ConfigField",
+ "RecognitionAudio",
+ "RecognizeResponse",
+ "StreamingRecognizeResponse",
+ "StreamingRecognitionResult",
+ "SpeechRecognitionResult",
+ "SpeechRecognitionAlternative",
+ "WordInfo",
+ "RecognitionLattice",
+ "LatticeEdge",
+ "Gender",
+ "Age",
+ ),
+)
+def asr_api_dictation_attr(request: pytest.FixtureRequest) -> str:
+ return cast(str, request.param)
+
+
+@pytest.fixture
+def asr_api_v1() -> object:
+ import asr_api.v1
+
+ return asr_api.v1
+
+
+@pytest.fixture(
+ params=(
+ "Asr",
+ "StreamingRecognizeRequest",
+ "StreamingRecognizeRequestConfig",
+ "ResultConfig",
+ "StreamingConfig",
+ "AudioConfig",
+ "AgeRecognitionConfig",
+ "GenderRecognitionConfig",
+ "LanguageRecognitionConfig",
+ "SpeechRecognitionConfig",
+ "StreamingRecognizeRequestControlMessage",
+ "StreamingRecognizeRequestData",
+ "Audio",
+ "StreamingRecognizeResponse",
+ "StreamingRecognizeResult",
+ "AgeRecognitionResult",
+ "AgeRecognitionAlternative",
+ "GenderRecognitionResult",
+ "GenderRecognitionAlternative",
+ "LanguageRecognitionResult",
+ "LanguageRecognitionAlternative",
+ "SpeechRecognitionResult",
+ "SpeechRecognitionAlternative",
+ "SpeechRecognitionWord",
+ ),
+)
+def asr_api_v1_attr(request: pytest.FixtureRequest) -> str:
+ return cast(str, request.param)
+
+
+@pytest.fixture
+def asr_api_v1p1() -> object:
+ import asr_api.v1p1
+
+ return asr_api.v1p1
+
+
+@pytest.fixture(
+ params=(
+ "Asr",
+ "StreamingRecognizeRequest",
+ "StreamingRecognizeRequestConfig",
+ "ResultConfig",
+ "StreamingConfig",
+ "AudioConfig",
+ "AgeRecognitionConfig",
+ "GenderRecognitionConfig",
+ "LanguageRecognitionConfig",
+ "SpeechRecognitionConfig",
+ "StreamingRecognizeRequestControlMessage",
+ "StreamingRecognizeRequestData",
+ "Audio",
+ "StreamingRecognizeResponse",
+ "StreamingRecognizeResult",
+ "AgeRecognitionResult",
+ "AgeRecognitionAlternative",
+ "GenderRecognitionResult",
+ "GenderRecognitionAlternative",
+ "LanguageRecognitionResult",
+ "LanguageRecognitionAlternative",
+ "SpeechRecognitionResult",
+ "SpeechRecognitionAlternative",
+ "SpeechRecognitionWord",
+ ),
+)
+def asr_api_v1p1_attr(request: pytest.FixtureRequest) -> str:
+ return cast(str, request.param)
+
+
+@pytest.mark.parametrize(
+ "api, attr",
+ (
+ pytest.param(
+ pytest.lazy_fixture("asr_api_dictation"),
+ pytest.lazy_fixture("asr_api_dictation_attr"),
+ marks=pytest.mark.api("techmo.asr.api.dictation"),
+ ),
+ pytest.param(
+ pytest.lazy_fixture("asr_api_v1"),
+ pytest.lazy_fixture("asr_api_v1_attr"),
+ marks=pytest.mark.api("techmo.asr.api.v1"),
+ ),
+ pytest.param(
+ pytest.lazy_fixture("asr_api_v1p1"),
+ pytest.lazy_fixture("asr_api_v1p1_attr"),
+ marks=pytest.mark.api("techmo.asr.api.v1p1"),
+ ),
+ ),
+)
+def test_hasattr(api: object, attr: str) -> None:
+ assert hasattr(api, attr)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..ce99428
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,36 @@
+[tox]
+# Python 3.7 is not downloadable via uv; minimum testable version is 3.8.
+# Python 3.14 is included to catch forward-compat issues early.
+envlist = py38, py39, py310, py311, py312, py313, py314
+requires =
+ tox-uv>=1
+
+[testenv]
+# skip_install keeps each env lightweight: the package is found via PYTHONPATH
+# rather than doing a full editable install in each env.
+# Proto stubs (techmo/asr/api/*_pb2.py) are gitignored — they are generated
+# by setup.py at build time. In CI a dedicated workflow step runs ./install.sh first;
+# locally, run `./install.sh` first (requires grpcio-tools and the asr-api
+# submodule).
+skip_install = true
+set_env = PYTHONPATH={toxinidir}
+# Pass service-address variables so integration tests can connect to a live service
+# when run via tox (e.g. tox -e py312 -- -m integration).
+passenv =
+ ASR_*
+deps =
+ # grpcio 1.71.0 dropped Python 3.8
+ grpcio>=1.49.4,<1.71.0; python_version=="3.8"
+ grpcio>=1.49.4; python_version>="3.9"
+ protobuf>=4.21.3,<6.0.0; python_version=="3.8"
+ protobuf>=4.21.3; python_version>="3.9"
+ pytest>=7.4.4,<8
+ pytest-cov>=4.1
+ pytest-lazy-fixture>=0.6.3
+commands_pre =
+ # Abort early with a clear message if proto stubs are missing rather than
+ # letting pytest fail with a cryptic ImportError deep in imports.
+ # PYTHONPATH already contains {toxinidir} so we use it to locate the stub.
+ python -c "import os, sys; stub = os.path.join(os.environ['PYTHONPATH'], 'techmo', 'asr', 'api', 'dictation', 'asr_pb2.py'); sys.exit(0) if os.path.exists(stub) else sys.exit('Proto stubs missing. Run: ./install.sh')"
+commands =
+ pytest --color=yes --cov=asr_api --cov-report=term-missing --cov-report=xml:{envtmpdir}/coverage.xml {posargs}