diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..48dadd7
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,69 @@
+# Runs the tox test suite on every supported Python version, plus a
+# non-blocking job for Python 3.14.
+name: Tests
+
+on:
+  push:
+    branches: ["main", "master"]
+  pull_request:
+  workflow_dispatch:
+
+# Least privilege: the jobs only need to read the repository contents.
+permissions:
+  contents: read
+
+jobs:
+  test:
+    name: Python ${{ matrix.python-version }}
+    runs-on: ubuntu-latest
+    strategy:
+      # Keep the other versions running when one of them fails.
+      fail-fast: false
+      matrix:
+        # Quoted so that "3.10" is not read as the float 3.1.
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          cache-dependency-glob: "pyproject.toml"
+
+      - name: Generate proto stubs and install package
+        run: ./install.sh
+
+      - name: Run tests via tox
+        env:
+          # Handed over via the environment rather than templated into the
+          # script text.
+          MATRIX_PYTHON_VERSION: "${{ matrix.python-version }}"
+        run: |
+          # Map e.g. "3.10" to the tox environment name "py310".
+          tox_env="py${MATRIX_PYTHON_VERSION//./}"
+          uvx --with "tox-uv>=1" tox -e "${tox_env}"
+
+  test-py314:
+    name: Python 3.14 (allowed failure)
+    runs-on: ubuntu-latest
+    # A failure of this job does not fail the workflow run.
+    continue-on-error: true
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: recursive
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          cache-dependency-glob: "pyproject.toml"
+
+      - name: Generate proto stubs and install package
+        run: ./install.sh
+
+      - name: Run tests via tox
+        run: uvx --with "tox-uv>=1" tox -e py314
diff --git a/.gitignore b/.gitignore
index ccdbf67..6262d55 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,10 @@
 build/
 *.egg-info/
 .eggs/
+.tox/
 google/
 __pycache__/
 techmo/
 .venv/
 .vscode/
+pre-commit/
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..20f982f
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "submodules/asr-api"]
+	path = submodules/asr-api
+	url = https://github.com/techmo-pl/asr-api.git
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3c71927..d75ed34 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,10 +1,28 @@
 # Changelog of ASR API (Python)
 
-## [1.0.0] - 2024-08-14
+
+## [1.1.4] - 2026-03-22
+
+### Fixed
+
+- `install.sh`: added `export PATH="$HOME/.local/bin:$PATH"` so that `uv` is found on runners where it is installed locally rather than system-wide.
+- `VERSION.py`: corrected version string (was not updated when 1.1.3 was tagged).
+
+### Changed
+
+- `setup.py`: replaced `pkg_resources` with importlib-compatible path resolution; removed upper bound on setuptools; removed upper bound on grpcio-tools build requirement.
+- `pyproject.toml`: removed upper bound on grpcio and protobuf runtime requirements; added Python-version markers to guard Python 3.8 users from grpcio>=1.71.0 and protobuf>=6.0.0; grpcio bounds set to `>=1.49.4,<1.71.0` for Python 3.8 and `>=1.49.4` for 3.9+; protobuf bounds set to `>=4.21.3,<6`; `requires-python` lowered to `>=3.8`; introduced upper bound on setuptools below 82; added `pip<26` constraint.
+- `tox.ini`, `install.sh`: introduced uv-based multi-version testing (Python 3.8–3.14); replaced Docker-based single-version test with tox multi-version matrix.
+- `submodules/asr-api`: updated to v1.1.1; restructured from committed proto files to a submodule.
+- `asr_api/`: support for _techmo.asr.api.v1p1_ API.
+- `tests/`: attribute check for _techmo.asr.api.v1p1_ API.
+
+
+## [1.0.0] - 2024-01-29
 
 ### Added
 
-- _asr_api_ package
-  - support for _techmo.asr.api.dictation_ API
-  - support for _techmo.asr.api.v1_ API
-- Setuptools configuration
+- `asr_api/`: support for _techmo.asr.api.dictation_ and _techmo.asr.api.v1_ APIs.
+- `pyproject.toml`: setuptools configuration.
+- `tests/`: attribute checks for _techmo.asr.api.dictation_ and _techmo.asr.api.v1_ APIs; coverage report.
+- `submodules/asr-api`: asr-api v1.0.0.
diff --git a/README.md b/README.md
index 2b90047..38d64d9 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,16 @@
 # ASR API (Python)
 
-The collection of gRPC APIs for Techmo ASR supplied as a Python package.
+The collection of gRPC APIs for Techmo ASR solutions supplied as a Python package.
 
 ## Setup
 
-The project can be used as-is and does not require any additional setup.
+Run once after cloning to initialise the submodule:
 
-## Requirements
+```sh
+./setup.sh
+```
+
+### Requirements
 
 - [Python](https://www.python.org/) >=3.8
 
@@ -23,6 +27,23 @@ pip install --require-virtualenv --upgrade pip
 pip install --require-virtualenv .
 ```
 
+*For basic development use, consider the convenient `./install.sh` script.*
+
+## Running tests
+
+Proto stubs must be generated before running tests. Use `./install.sh` once, then invoke tox:
+
+```sh
+./install.sh
+uvx --with "tox-uv>=1" tox
+```
+
+To run a single Python version:
+
+```sh
+uvx --with "tox-uv>=1" tox -e py312
+```
+
 ## Usage
 
 ### Import
@@ -31,6 +52,14 @@ The package provides a precompiled collection of `.proto` files that can be impo
 
 Example:
 
+- direct import
+
+```python
+>>> from techmo.asr.api.v1p1 import asr_pb2 as api
+>>> hasattr(api, "StreamingRecognizeRequest")
+True
+```
+
 - import from an alias module
 
 ```python
diff --git a/VERSION.md b/VERSION.md
deleted file mode 100644
index 3eefcb9..0000000
--- a/VERSION.md
+++ /dev/null
@@ -1 +0,0 @@
-1.0.0
diff --git a/asr_api/VERSION.py b/asr_api/VERSION.py
index 5becc17..c72e379 100644
--- a/asr_api/VERSION.py
+++ b/asr_api/VERSION.py
@@ -1 +1 @@
-__version__ = "1.0.0"
+__version__ = "1.1.4"
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..490190a
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+# usage: ./install.sh [VENV_PATH]
+#
+# VENV_PATH: Optional path for the virtual environment (default: ./.venv).
+#
+# Creates a virtualenv with uv and installs the package with test dependencies.
+
+set -euo pipefail
+
+# Make a user-local uv installation (typically ~/.local/bin) discoverable on
+# runners where uv is not installed system-wide.
+export PATH="$HOME/.local/bin:$PATH"
+
+if ! command -v uv >/dev/null 2>&1; then
+    echo "install.sh: 'uv' not found on PATH; install it first (https://docs.astral.sh/uv/)." >&2
+    exit 1
+fi
+
+VENV_PATH="${1:-.venv}"
+
+# Reuse an existing virtualenv instead of recreating it.
+if [ ! -d "${VENV_PATH}" ]; then
+    uv venv "${VENV_PATH}"
+fi
+
+# shellcheck disable=SC1091
+source "${VENV_PATH}/bin/activate"
+# Editable install of the package together with its "tests" extra.
+uv pip install -e ".[tests]"
diff --git a/proto/google/rpc/status.proto b/proto/google/rpc/status.proto
deleted file mode 100644
index 923e169..0000000
--- a/proto/google/rpc/status.proto
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2022 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-syntax = "proto3";
-
-package google.rpc;
-
-import "google/protobuf/any.proto";
-
-option cc_enable_arenas = true;
-option go_package = "google.golang.org/genproto/googleapis/rpc/status;status";
-option java_multiple_files = true;
-option java_outer_classname = "StatusProto";
-option java_package = "com.google.rpc";
-option objc_class_prefix = "RPC";
-
-// The `Status` type defines a logical error model that is suitable for
-// different programming environments, including REST APIs and RPC APIs. It is
-// used by [gRPC](https://github.com/grpc). Each `Status` message contains
-// three pieces of data: error code, error message, and error details.
-//
-// You can find out more about this error model and how to work with it in the
-// [API Design Guide](https://cloud.google.com/apis/design/errors).
-message Status {
-  // The status code, which should be an enum value of
-  // [google.rpc.Code][google.rpc.Code].
-  int32 code = 1;
-
-  // A developer-facing error message, which should be in English. Any
-  // user-facing error message should be localized and sent in the
-  // [google.rpc.Status.details][google.rpc.Status.details] field, or localized
-  // by the client.
-  string message = 2;
-
-  // A list of messages that carry the error details.  There is a common set of
-  // message types for APIs to use.
-  repeated google.protobuf.Any details = 3;
-}
diff --git a/proto/techmo/api/status.proto b/proto/techmo/api/status.proto
deleted file mode 100644
index 71a21ca..0000000
--- a/proto/techmo/api/status.proto
+++ /dev/null
@@ -1,20 +0,0 @@
-// Copyright 2023 Techmo sp. z o.o.
-
-syntax = "proto3";
-
-package techmo.api;
-
-import "google/protobuf/any.proto";
-
-
-//
-message Status {
-    //
-    int32 code = 1;
-
-    //
-    string message = 2;
-
-    //
-    repeated google.protobuf.Any details = 3;
-}
diff --git a/proto/techmo/asr/api/dictation/asr.proto b/proto/techmo/asr/api/dictation/asr.proto
deleted file mode 100644
index 2380707..0000000
--- a/proto/techmo/asr/api/dictation/asr.proto
+++ /dev/null
@@ -1,782 +0,0 @@
-// Copyright 2018 Google LLC.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Modified by Techmo, copyright by Google. Changes include:
-// 1. Additions that introduce new features to the original API. Extensions
-// (parts that were added to the original document) by Techmo are marked with
-// [**Extension by Techmo**] tag.
-//  - `MP3` audio encoding type.
-//  - `ConfigField` as means to provide additional configuration.
-//  - `ResultFinalizationCause` as means to indicate MRCPv2-related recognition
-//    result finalization cause.
-//  - `RecognitionLattice` and `LatticeEdge` as means to return detailed
-//    recognition results.
-//  - `Age` and `Gender` as means to provide age and gender recognition results
-//    in `SpeechRecognitionResult` and `StreamingRecognitionResult`.
-// 2. Modifications of comments, according to how recognition is performed by Techmo.
-//  - [*Unused*] tags for fields or values that are not used (ignored when
-//    provided in request, never returned in response).
-//  - [*Unsupported*] tags for fields or values that will result in an error
-//    when provided in request.
-// 3. Removal of `LongRunningRecognize` support (commented out).
-
-syntax = "proto3";
-
-package google.cloud.speech.v1;
-
-// import "google/api/annotations.proto";
-// import "google/longrunning/operations.proto";
-// import "google/protobuf/any.proto";
-import "google/protobuf/duration.proto";
-// import "google/protobuf/empty.proto";
-// import "google/protobuf/timestamp.proto";
-import "google/rpc/status.proto";
-
-option cc_enable_arenas = true;
-option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1;speech";
-option java_multiple_files = true;
-option java_outer_classname = "SpeechProto";
-option java_package = "com.google.cloud.speech.v1";
-
-
-// Service that implements Google Cloud Speech API extended by Techmo.
-service Speech {
-  // Performs synchronous speech recognition: receive results after all audio
-  // has been sent and processed.
-  rpc Recognize(RecognizeRequest) returns (RecognizeResponse) {
-    // option (google.api.http) = {
-    //   post: "/v1/speech:recognize"
-    //   body: "*"
-    // };
-  }
-
-  // // Performs asynchronous speech recognition: receive results via the
-  // // google.longrunning.Operations interface. Returns either an
-  // // `Operation.error` or an `Operation.response` which contains
-  // // a `LongRunningRecognizeResponse` message.
-  // rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
-  //   option (google.api.http) = {
-  //     post: "/v1/speech:longrunningrecognize"
-  //     body: "*"
-  //   };
-  // }
-
-  // Performs bidirectional streaming speech recognition: receive results while
-  // sending audio. This method is only available via the gRPC API (not REST).
-  rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
-  }
-}
-
-// The top-level message sent by the client for the `Recognize` method.
-message RecognizeRequest {
-  // [*Required*] Provides information to the recognizer that specifies how to
-  // process the request.
-  RecognitionConfig config = 1;
-
-  // [*Required*] The audio data to be recognized.
-  RecognitionAudio audio = 2;
-}
-
-// // The top-level message sent by the client for the `LongRunningRecognize`
-// // method.
-// message LongRunningRecognizeRequest {
-//   // [*Required*] Provides information to the recognizer that specifies how to
-//   // process the request.
-//   RecognitionConfig config = 1;
-
-//   // [*Required*] The audio data to be recognized.
-//   RecognitionAudio audio = 2;
-// }
-
-// The top-level message sent by the client for the `StreamingRecognize` method.
-// Multiple `StreamingRecognizeRequest` messages are sent. The first message
-// must contain a `streaming_config` message and must not contain `audio` data.
-// All subsequent messages must contain `audio` data and must not contain a
-// `streaming_config` message.
-message StreamingRecognizeRequest {
-  // The streaming request, which is either a streaming config or audio content.
-  oneof streaming_request {
-    // Provides information to the recognizer that specifies how to process the
-    // request. The first `StreamingRecognizeRequest` message must contain a
-    // `streaming_config`  message.
-    StreamingRecognitionConfig streaming_config = 1;
-
-    // The audio data to be recognized. Sequential chunks of audio data are sent
-    // in sequential `StreamingRecognizeRequest` messages. The first
-    // `StreamingRecognizeRequest` message must not contain `audio_content` data
-    // and all subsequent `StreamingRecognizeRequest` messages must contain
-    // `audio_content` data. The audio bytes must be encoded as specified in
-    // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
-    // pure binary representation (not base64).
-    bytes audio_content = 2;
-
-    // @exclude See [content limits](/speech-to-text/quotas#content).
-
-    // [**Extension by Techmo**]
-    // Another experimental feature from MRCPv2.
-    // https://www.rfc-editor.org/rfc/rfc6787.html#section-9.13
-    bool start_input_timers = 3;
-  }
-}
-
-// Provides information to the recognizer that specifies how to process the
-// request.
-message StreamingRecognitionConfig {
-  // [*Required*] Provides information to the recognizer that specifies how to
-  // process the request.
-  RecognitionConfig config = 1;
-
-  // [*Optional*] If `false` or omitted, the recognizer will perform continuous
-  // recognition (continuing to wait for and process audio even if the user
-  // pauses speaking) until the client closes the input stream (gRPC API) or
-  // until the maximum time limit has been reached. May return multiple
-  // `StreamingRecognitionResult`s with the `is_final` flag set to `true`.
-  // If `true`, the recognizer will detect a single spoken utterance. When it
-  // detects that the user has paused or stopped speaking, it will return an
-  // `END_OF_SINGLE_UTTERANCE` event and cease recognition. It will return no
-  // more than one `StreamingRecognitionResult` with the `is_final` flag set to
-  // `true`.
-  bool single_utterance = 2;
-
-  // [*Optional*] If `true`, interim results (tentative hypotheses) may be
-  // returned as they become available (these interim results are indicated with
-  // the `is_final=false` flag).
-  // If `false` or omitted, only `is_final=true` result(s) are returned.
-  bool interim_results = 3;
-
-  // [**Extension by Techmo**]
-  // Another experimental feature from MRCPv2.
-  // https://www.rfc-editor.org/rfc/rfc6787.html#section-9.4.14
-  optional bool start_input_timers = 4;
-}
-
-// Provides information to the recognizer that specifies how to process the
-// request.
-message RecognitionConfig {
-
-  // @exclude The encoding of the audio data sent in the request.
-  //
-  // All encodings support only 1 channel (mono) audio.
-  //
-  // For best results, the audio source should be captured and transmitted using
-  // a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
-  // recognition can be reduced if lossy codecs are used to capture or transmit
-  // audio, particularly if background noise is present. Lossy codecs include
-  // `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `SPEEX_WITH_HEADER_BYTE`.
-  //
-  // The `FLAC` and `WAV` audio file formats include a header that describes the
-  // included audio content. You can request recognition for `WAV` files that
-  // contain either `LINEAR16` or `MULAW` encoded audio.
-  // If you send `FLAC` or `WAV` audio file format in
-  // your request, you do not need to specify an `AudioEncoding`; the audio
-  // encoding format is determined from the file header. If you specify
-  // an `AudioEncoding` when you send  send `FLAC` or `WAV` audio, the
-  // encoding configuration must match the encoding described in the audio
-  // header; otherwise the request returns an
-  // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
-
-  // The encoding of the audio data sent in the request.
-  // All encodings support only 1 channel (mono) audio.
-  enum AudioEncoding {
-    // Not specified. Will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
-    ENCODING_UNSPECIFIED = 0;
-
-    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
-    LINEAR16 = 1;
-
-    // `FLAC` (Free Lossless Audio
-    // Codec) is the recommended encoding because it is
-    // lossless--therefore recognition is not compromised--and
-    // requires only about half the bandwidth of `LINEAR16`. `FLAC` stream
-    // encoding supports 16-bit and 24-bit samples, however, not all fields in
-    // `STREAMINFO` are supported.
-    // [**Extension by Techmo**] Supported only by `Recognize`. When requested by `StreamingRecognize`, will return result [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. Silently ignores `sample_rate_hertz` and detects real sample rate from file header instead.
-    FLAC = 2;
-
-    // @exclude 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
-
-    // [*Unsupported*]
-    MULAW = 3;
-
-    // @exclude Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
-
-    // [*Unsupported*]
-    AMR = 4;
-
-    // @exclude Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
-
-    // [*Unsupported*]
-    AMR_WB = 5;
-
-    // Opus encoded audio frames in Ogg container
-    // ([OggOpus](https://wiki.xiph.org/OggOpus)).
-    // [**Extension by Techmo**] Silently ignores `sample_rate_hertz` and detects real sample rate from file header instead.
-    OGG_OPUS = 6;
-
-    // @exclude `sample_rate_hertz` must be one of 8000, 12000, 16000, 24000, or 48000.
-
-    // @exclude Although the use of lossy encodings is not recommended, if a very low
-    // bitrate encoding is required, `OGG_OPUS` is highly preferred over
-    // Speex encoding. The [Speex](https://speex.org/)  encoding supported by
-    // Cloud Speech API has a header byte in each block, as in MIME type
-    // `audio/x-speex-with-header-byte`.
-    // It is a variant of the RTP Speex encoding defined in
-    // [RFC 5574](https://tools.ietf.org/html/rfc5574).
-    // The stream is a sequence of blocks, one block per RTP packet. Each block
-    // starts with a byte containing the length of the block, in bytes, followed
-    // by one or more frames of Speex data, padded to an integral number of
-    // bytes (octets) as specified in RFC 5574. In other words, each RTP header
-    // is replaced with a single byte containing the block length. Only Speex
-    // wideband is supported. `sample_rate_hertz` must be 16000.
-
-    // [*Unsupported*]
-    SPEEX_WITH_HEADER_BYTE = 7;
-
-    // [**Extension by Techmo**] `MP3` (standards ISO/IEC 11172-3 and ISO/IEC 13818-3) Only constant bit rate files are accepted. Silently ignores `sample_rate_hertz` and detects real sample rate from file header instead.
-    MP3 = 8;
-  }
-
-  // [*Required*] Encoding of audio data sent in all `RecognitionAudio` messages.
-  AudioEncoding encoding = 1;
-
-  // @exclude Encoding of audio data sent in all `RecognitionAudio` messages.
-  // This field is optional for `FLAC` and `WAV` audio files and required
-  // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
-
-  // [*Required*] Sample rate in Hertz of the audio data sent in all
-  // `RecognitionAudio` messages. Valid values are: 8000-48000.
-  // 16000 is optimal. For best results, set the sampling rate of the audio
-  // source to 16000 Hz. If that's not possible, use the native sample rate of
-  // the audio source (instead of re-sampling).
-  // [**Extension by Techmo**] Silently ignored for `FLAC`, `OGG_OPUS` and `MP3` encodings. Real sample rate will be detected from file header instead.
-  int32 sample_rate_hertz = 2;
-
-  // @exclude This field is optional for `FLAC` and `WAV` audio files and required
-  // for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
-
-  // [*Required*] The language of the supplied audio as a
-  // [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
-  // Example: "en-US".
-  // The only language supported at the moment is Polish (`pl-PL`).
-  string language_code = 3;
-
-  // @exclude See [Language Support](/speech-to-text/docs/languages)
-  // for a list of the currently supported language codes.
-
-  // [*Optional*] Maximum number of recognition hypotheses to be returned.
-  // Specifically, the maximum number of `SpeechRecognitionAlternative` messages
-  // within each `SpeechRecognitionResult`.
-  // The server may return fewer than `max_alternatives`.
-  // Valid values are `0`-`30`. A value of `0` or `1` will return a maximum of
-  // one. If omitted, will return a maximum of one.
-  int32 max_alternatives = 4;
-
-  // @exclude [*Optional*] If set to `true`, the server will attempt to filter out
-  // profanities, replacing all but the initial character in each filtered word
-  // with asterisks, e.g. "f***". If set to `false` or omitted, profanities
-  // won't be filtered out.
-
-  // [*Optional*][*Unused*]
-  bool profanity_filter = 5;
-
-  // @exclude [*Optional*] array of [SpeechContext][google.cloud.speech.v1.SpeechContext].
-  // First element of the array is used to identify context model to be used
-  // in current recognition.
-
-  // [*Optional*]
-  repeated SpeechContext speech_contexts = 6;
-
-  // [*Optional*] If `true`, the top result includes a list of words and
-  // the start and end time offsets (timestamps) for those words. If
-  // `false`, no word-level time offset information is returned. The default is
-  // `false`.
-  bool enable_word_time_offsets = 8;
-
-  // @exclude [*Optional*] If 'true', adds punctuation to recognition result hypotheses.
-  // This feature is only available in select languages. Setting this for
-  // requests in other languages has no effect at all.
-  // The default 'false' value does not add punctuation to result hypotheses.
-  // Note: This is currently offered as an experimental service, complimentary
-  // to all users. In the future this may be exclusively available as a
-  // premium feature.
-
-  // [*Optional*][*Unused*]
-  bool enable_automatic_punctuation = 11;
-
-  // [**Extension by Techmo**]
-  // [*Optional*] A means to provide additional configuration fields via request.
-  repeated ConfigField config_fields = 12;
-
-  // @exclude [*Optional*] Which model to select for the given request. Select the model
-  // best suited to your domain to get best results. If a model is not
-  // explicitly specified, then we auto-select a model based on the parameters
-  // in the RecognitionConfig.
-  // <table>
-  //   <tr>
-  //     <td><b>Model</b></td>
-  //     <td><b>Description</b></td>
-  //   </tr>
-  //   <tr>
-  //     <td><code>command_and_search</code></td>
-  //     <td>Best for short queries such as voice commands or voice search.</td>
-  //   </tr>
-  //   <tr>
-  //     <td><code>phone_call</code></td>
-  //     <td>Best for audio that originated from a phone call (typically
-  //     recorded at an 8khz sampling rate).</td>
-  //   </tr>
-  //   <tr>
-  //     <td><code>video</code></td>
-  //     <td>Best for audio that originated from from video or includes multiple
-  //         speakers. Ideally the audio is recorded at a 16khz or greater
-  //         sampling rate. This is a premium model that costs more than the
-  //         standard rate.</td>
-  //   </tr>
-  //   <tr>
-  //     <td><code>default</code></td>
-  //     <td>Best for audio that is not one of the specific audio models.
-  //         For example, long-form audio. Ideally the audio is high-fidelity,
-  //         recorded at a 16khz or greater sampling rate.</td>
-  //   </tr>
-  // </table>
-
-  // [*Optional*][*Unused*]
-  string model = 13;
-
-  // @exclude [*Optional*] Set to true to use an enhanced model for speech recognition.
-  // You must also set the `model` field to a valid, enhanced model. If
-  // `use_enhanced` is set to true and the `model` field is not set, then
-  // `use_enhanced` is ignored. If `use_enhanced` is true and an enhanced
-  // version of the specified model does not exist, then the speech is
-  // recognized using the standard version of the specified model.
-  //
-  // Enhanced speech models require that you opt-in to data logging using
-  // instructions in the [documentation](/speech-to-text/enable-data-logging).
-  // If you set `use_enhanced` to true and you have not enabled audio logging,
-  // then you will receive an error.
-
-  // [*Optional*][*Unused*]
-  bool use_enhanced = 14;
-
-  // [**Extension by Techmo**][*Optional*] Gender and age recognition parameters
-  SpeechDurationConfig speech_duration_gender_recognition = 15;
-
-  // [**Extension by Techmo**][*Optional*] Gender and age recognition parameters
-  SpeechDurationConfig speech_duration_age_recognition = 16;
-}
-
-// [**Extension by Techmo**] Gender and age recognition parameters.
-message SpeechDurationConfig {
-  // The way in which service decides when to start recognition.
-  SpeechDurationThresholdMode speech_duration_threshold_mode = 3;
-
-  // The minimum duration of speech in `audio` required to start recognition, in ms.
-  // Ignored, unless `speech_duration_threshold_mode` is `CUSTOM`.
-  uint32 speech_duration_threshold_ms = 4;
-}
-
-// [**Extension by Techmo**]
-// The possible ways for a service to decide when to start recognition
-// depending on a duration of speech in `audio`.
-enum SpeechDurationThresholdMode {
-  // Use an implementation-defined threshold value carefully tuned to obtain best results.
-  DEFAULT = 0;
-
-  // Use an user-defined threshold value provided in the configuration message.
-  CUSTOM = 1;
-
-  // Disable early start of recognition and wait for the entire audio data.
-  DISABLED = 2;
-}
-
-
-// @exclude Provides "hints" to the speech recognizer to favor specific words and phrases
-// in the results.
-
-message SpeechContext {
-  // @exclude [*Optional*] Can be used to send a context phrase that switches the model
-  // used during recognition. If the phrase correctly identifies the context model
-  // used in service, it will be used instead of the general model for the current recognition.
-  // Due to compatibility with Google API, the object is defined as a list of strings,
-  // but only the first element of the list is used as the context phrase,
-  // the rest are ignored if present.
-
-  repeated string phrases = 1;
-}
-
-// [**Extension by Techmo**]
-// Provides a pair of configuration field name and value.
-message ConfigField {
-  // Name of configuration field.
-  string key = 1;
-
-  // Value of configuration field.
-  string value = 2;
-}
-
-// @exclude Contains audio data in the encoding specified in the `RecognitionConfig`.
-// Either `content` or `uri` must be supplied. Supplying both or neither
-// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
-// [audio limits](https://cloud.google.com/speech/limits#content).
-
-// Contains audio data in the encoding specified in the `RecognitionConfig`.
-// Only `content` is allowed to be supplied.
-message RecognitionAudio {
-
-  // @exclude The audio source, which is either inline content or a Google Cloud
-  // Storage uri.
-
-  // The audio source, which is inline content.
-  oneof audio_source {
-    // The audio data bytes encoded as specified in
-    // `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
-    // pure binary representation, whereas JSON representations use base64.
-    bytes content = 1;
-
-    // @exclude URI that points to a file that contains audio data bytes as specified in
-    // `RecognitionConfig`. The file must not be compressed (for example, gzip).
-    // Currently, only Google Cloud Storage URIs are
-    // supported, which must be specified in the following format:
-    // `gs://bucket_name/object_name` (other URI formats return
-    // [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
-    // [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
-
-    // [*Unsupported*]
-    string uri = 2;
-  }
-}
-
-// The only message returned to the client by the `Recognize` method. It
-// contains the result as zero or more sequential `SpeechRecognitionResult`
-// messages.
-message RecognizeResponse {
-  // [*Output only*] Sequential list of transcription results corresponding to
-  // sequential portions of audio.
-  repeated SpeechRecognitionResult results = 2;
-}
-
-// // The only message returned to the client by the `LongRunningRecognize` method.
-// // It contains the result as zero or more sequential `SpeechRecognitionResult`
-// // messages. It is included in the `result.response` field of the `Operation`
-// // returned by the `GetOperation` call of the `google::longrunning::Operations`
-// // service.
-// message LongRunningRecognizeResponse {
-//   // [*Output only*] Sequential list of transcription results corresponding to
-//   // sequential portions of audio.
-//   repeated SpeechRecognitionResult results = 2;
-// }
-
-// // Describes the progress of a long-running `LongRunningRecognize` call. It is
-// // included in the `metadata` field of the `Operation` returned by the
-// // `GetOperation` call of the `google::longrunning::Operations` service.
-// message LongRunningRecognizeMetadata {
-//   // Approximate percentage of audio processed thus far. Guaranteed to be 100
-//   // when the audio is fully processed and the results are available.
-//   int32 progress_percent = 1;
-
-//   // Time when the request was received.
-//   google.protobuf.Timestamp start_time = 2;
-
-//   // Time of the most recent processing update.
-//   google.protobuf.Timestamp last_update_time = 3;
-// }
-
-// `StreamingRecognizeResponse` is the only message returned to the client by
-// `StreamingRecognize`. A series of one or more `StreamingRecognizeResponse`
-// messages are streamed back to the client.
-//
-// Here's an example of a series of ten `StreamingRecognizeResponse`s that might
-// be returned while processing audio:
-//
-// 1. results { alternatives { transcript: "tube" } stability: 0.01 }
-//
-// 2. results { alternatives { transcript: "to be a" } stability: 0.01 }
-//
-// 3. results { alternatives { transcript: "to be" } stability: 0.9 }
-//    results { alternatives { transcript: " or not to be" } stability: 0.01 }
-//
-// 4. results { alternatives { transcript: "to be or not to be"
-//                             confidence: 0.92 }
-//              alternatives { transcript: "to bee or not to bee" }
-//              is_final: true }
-//
-// 5. results { alternatives { transcript: " that's" } stability: 0.01 }
-//
-// 6. results { alternatives { transcript: " that is" } stability: 0.9 }
-//    results { alternatives { transcript: " the question" } stability: 0.01 }
-//
-// 7. results { alternatives { transcript: " that is the question"
-//                             confidence: 0.98 }
-//              alternatives { transcript: " that was the question" }
-//              is_final: true }
-//
-// Notes:
-//
-// - Only two of the above responses #4 and #7 contain final results; they are
-//   indicated by `is_final: true`. Concatenating these together generates the
-//   full transcript: "to be or not to be that is the question".
-//
-// - The others contain interim `results`. #3 and #6 contain two interim
-//   `results`: the first portion has a high stability and is less likely to
-//   change; the second portion has a low stability and is very likely to
-//   change. A UI designer might choose to show only high stability `results`.
-//
-// - The specific `stability` and `confidence` values shown above are only for
-//   illustrative purposes. Actual values may vary.
-//
-// - In each response, only one of these fields will be set:
-//     `error`,
-//     `speech_event_type`, or
-//     one or more (repeated) `results`.
-message StreamingRecognizeResponse {
-  // Indicates the type of speech event.
-  enum SpeechEventType {
-    // No speech event specified.
-    SPEECH_EVENT_UNSPECIFIED = 0;
-
-    // This event indicates that the server has detected the end of the user's
-    // speech utterance and expects no additional speech. Therefore, the server
-    // will not process additional audio (although it may subsequently return
-    // additional results). The client should stop sending additional audio
-    // data, half-close the gRPC connection, and wait for any additional results
-    // until the server closes the gRPC connection. This event is only sent if
-    // `single_utterance` was set to `true`, and is not used otherwise.
-    END_OF_SINGLE_UTTERANCE = 1;
-  }
-
-  // [*Output only*] If set, returns a [google.rpc.Status][google.rpc.Status] message that
-  // specifies the error for the operation.
-  google.rpc.Status error = 1;
-
-  // [*Output only*] This repeated list contains zero or more results that
-  // correspond to consecutive portions of the audio currently being processed.
-  // It contains zero or one `is_final=true` result (the newly settled portion),
-  // followed by zero or more `is_final=false` results (the interim results).
-  repeated StreamingRecognitionResult results = 2;
-
-  // [*Output only*] Indicates the type of speech event.
-  SpeechEventType speech_event_type = 4;
-}
-
-// A streaming speech recognition result corresponding to a portion of the audio
-// that is currently being processed.
-message StreamingRecognitionResult {
-  // [**Extension by Techmo**]
-  // Indicates the cause of recognition result finalization. These are MRCPv2-related.
-  // See [Completion-Cause](https://tools.ietf.org/html/rfc6787#section-9.4.11).
-  enum ResultFinalizationCause {
-    // No recognition result finalization cause specified.
-    RESULT_FINALIZATION_CAUSE_UNSPECIFIED = 0;
-
-    // Recognition has been finalized with a complete result
-    // after specified length of silence after user speech.
-    // See [Speech-Complete-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.15).
-    SUCCESS = 1;
-
-    // Recognition has started and there was no speech detected
-    // for a certain period of time.
-    // See [No-Input-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.6).
-    NO_INPUT_TIMEOUT = 2;
-
-    // Recognition has been finalized because speech was too long, with a complete result.
-    // See [Recognition-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.7).
-    SUCCESS_MAXTIME = 3;
-
-    // Recognition has been finalized with an incomplete result
-    // after specified length of silence after user speech.
-    // See [Speech-Incomplete-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.16).
-    PARTIAL_MATCH = 4;
-
-    // Recognition has been finalized because speech was too long, with no result.
-    // See [Recognition-Timeout](https://tools.ietf.org/html/rfc6787#section-9.4.7).
-    NO_MATCH_MAXTIME = 5;
-  }
-
-  // [*Output only*] May contain one or more recognition hypotheses (up to the
-  // maximum specified in `max_alternatives`).
-  // These alternatives are ordered in terms of accuracy, with the top (first)
-  // alternative being the most probable, as ranked by the recognizer.
-  repeated SpeechRecognitionAlternative alternatives = 1;
-
-  // [*Output only*] If `false`, this `StreamingRecognitionResult` represents an
-  // interim result that may change. If `true`, this is the final time the
-  // speech service will return this particular `StreamingRecognitionResult`,
-  // the recognizer will not return any further hypotheses for this portion of
-  // the transcript and corresponding audio.
-  bool is_final = 2;
-
-  // @exclude [*Output only*] An estimate of the likelihood that the recognizer will not
-  // change its guess about this interim result. Values range from 0.0
-  // (completely unstable) to 1.0 (completely stable).
-  // This field is only provided for interim results (`is_final=false`).
-  // The default of 0.0 is a sentinel value indicating `stability` was not set.
-
-  // [*Unused*]
-  float stability = 3;
-
-  // [**Extension by Techmo**]
-  // [*Output only*] Indicates the cause of recognition result finalization.
-  ResultFinalizationCause result_finalization_cause = 4;
-
-  // [**Extension by Techmo**]
-  // [*Output only*] Detailed recognition result (lattice).
-  // Returned only when requested (`ConfigField`: return-lattice=true in
-  // `RecognitionConfig` Message), only for final (`is_final = true`) results,
-  // and only when it's allowed by licence.
-  // When requested and not allowed by licence, [google.rpc.Code.FAILED_PRECONDITION]
-  // will be returned.
-  repeated RecognitionLattice lattice = 5;
-
-  // [**Extension by Techmo**]
-  // [*Output only*] Predicted gender of the speaker
-  Gender gender = 6;
-
-  // [**Extension by Techmo**]
-  // [*Output only*] Predicted age of the speaker
-  Age age = 7;
-}
-
-// A speech recognition result corresponding to a portion of the audio.
-message SpeechRecognitionResult {
-  // [*Output only*] May contain one or more recognition hypotheses (up to the
-  // maximum specified in `max_alternatives`).
-  // These alternatives are ordered in terms of accuracy, with the top (first)
-  // alternative being the most probable, as ranked by the recognizer.
-  repeated SpeechRecognitionAlternative alternatives = 1;
-
-  // [**Extension by Techmo**]
-  // [*Output only*] Detailed recognition result (lattice).
-  // Returned only when requested (`ConfigField`: return-lattice=true in
-  // `RecognitionConfig` Message), only for final (`is_final = true`) results,
-  // and only when it's allowed by licence.
-  // When requested and not allowed by licence, [google.rpc.Code.FAILED_PRECONDITION]
-  // will be returned.
-  repeated RecognitionLattice lattice = 5;
-
-  // [**Extension by Techmo**]
-  // [*Output only*] Predicted gender of the speaker
-  Gender gender = 6;
-
-  // [**Extension by Techmo**]
-  // [*Output only*] Predicted age of the speaker
-  Age age = 7;
-}
-
-// Alternative hypotheses (a.k.a. n-best list).
-message SpeechRecognitionAlternative {
-  // [*Output only*] Transcript text representing the words that the user spoke.
-  string transcript = 1;
-
-  // [*Output only*] The confidence estimate between 0.0 and 1.0. A higher number
-  // indicates an estimated greater likelihood that the recognized words are
-  // correct.
-  float confidence = 2;
-
-  // @exclude This field is set only for the top alternative of a non-streaming
-  // result or, of a streaming result where `is_final=true`.
-  // This field is not guaranteed to be accurate and users should not rely on it
-  // to be always provided.
-  // The default of 0.0 is a sentinel value indicating `confidence` was not set.
-
-  // [*Output only*] A list of word-specific information for each recognized word.
-  repeated WordInfo words = 3;
-}
-
-// Word-specific information for recognized words. Word information is only
-// included in the response when certain request parameters are set, such
-// as `enable_word_time_offsets`.
-message WordInfo {
-  // @exclude [*Output only*] Time offset relative to the beginning of the audio,
-  // and corresponding to the start of the spoken word.
-  // This field is only set if `enable_word_time_offsets=true` and only
-  // in the top hypothesis.
-  // This is an experimental feature and the accuracy of the time offset can
-  // vary.
-
-  // [*Output only*] Time offset relative to the beginning of the audio,
-  // and corresponding to the start of the spoken word.
-  // This field is only set if `enable_word_time_offsets=true`.
-  google.protobuf.Duration start_time = 1;
-
-  // @exclude [*Output only*] Time offset relative to the beginning of the audio,
-  // and corresponding to the end of the spoken word.
-  // This field is only set if `enable_word_time_offsets=true` and only
-  // in the top hypothesis.
-  // This is an experimental feature and the accuracy of the time offset can
-  // vary.
-
-  // [*Output only*] Time offset relative to the beginning of the audio,
-  // and corresponding to the end of the spoken word.
-  // This field is only set if `enable_word_time_offsets=true`.
-  google.protobuf.Duration end_time = 2;
-
-  // [*Output only*] The word corresponding to this set of information.
-  string word = 3;
-}
-
-// [**Extension by Techmo**]
-// Detailed recognition result (lattice).
-// Returned only when requested (`ConfigField`: return-lattice=true in
-// `RecognitionConfig` Message), only for final (`is_final = true`) results,
-// and only when it's allowed by licence. When requested and not allowed by
-// licence, [google.rpc.Code.FAILED_PRECONDITION] will be returned.
-message RecognitionLattice {
-  // List of final nodes.
-  repeated int32 final_nodes = 1;
-
-  // List of lattice edges.
-  repeated LatticeEdge edges = 2;
-}
-
-// [**Extension by Techmo**]
-// Edge-specific information for recognition lattice.
-message LatticeEdge {
-  // Input node ID, node '0' is starting node for the lattice.
-  int32 start_node = 1;
-
-  // End node ID.
-  int32 end_node = 2;
-
-  // Word.
-  string symbol = 3;
-
-  // Language model cost.
-  float language_cost = 4;
-
-  // Raw acoustic score (unscaled).
-  float acoustic_cost = 5;
-
-  // Word duration in milliseconds.
-  int32 duration = 6;
-}
-
-// [**Extension by Techmo**]
-// Predicted gender of the speaker
-message Gender {
-  // The recognized gender label.
-  string gender = 1;
-
-  // The confidence in [0, 1] range, where near 0 means 'unsure' and near 1 means 'almost certain'.
-  float confidence = 2;
-}
-
-// [**Extension by Techmo**]
-// Predicted age of the speaker
-message Age {
-  // The recognized age, in years.
-  int32 age = 1;
-
-  // The confidence in [0, 1] range, where near 0 means 'unsure' and near 1 means 'almost certain'.
-  float confidence = 2;
-}
diff --git a/proto/techmo/asr/api/v1/asr.proto b/proto/techmo/asr/api/v1/asr.proto
deleted file mode 100644
index a83f414..0000000
--- a/proto/techmo/asr/api/v1/asr.proto
+++ /dev/null
@@ -1,436 +0,0 @@
-// Copyright 2023 Techmo sp. z o.o.
-
-syntax = "proto3";
-
-package techmo.asr.api.v1;
-
-import "google/protobuf/duration.proto";
-import "techmo/api/status.proto";
-
-
-// An automatic speech recognition (ASR) service providing a solution for
-// speech-to-text conversion extended by the assessment of additional speech
-// and speaker features.
-service Asr {
-  // Perform bidirectional streaming recognition.
-  rpc StreamingRecognize(stream StreamingRecognizeRequest)
-    returns (stream StreamingRecognizeResponse) {}
-}
-
-// A message streamed from the client through
-// the [`StreamingRecognize`](#StreamingRecognize) method.
-message StreamingRecognizeRequest {
-  oneof request_content {
-    // The immutable initial configuration of the request.
-    // Must be sent once in the request's first message.
-    StreamingRecognizeRequestConfig config = 1;
-
-    // The message controlling the processing flow of the request.
-    // May be sent multiple times except in the request's first message.
-    StreamingRecognizeRequestControlMessage control_message = 2;
-
-    // The data contents of the request itself.
-    // May be sent multiple times except in the request's first message.
-    StreamingRecognizeRequestData data = 3;
-  }
-}
-
-// A message holding configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingRecognizeRequestConfig {
-  // Part of the configuration for the request's audio content.
-  AudioConfig audio_config = 1;
-
-  // Part of the configuration for the request's result form.
-  ResultConfig result_config = 2;
-
-  // Part of the configuration for the request's processing flow.
-  StreamingConfig streaming_config = 3;
-
-  // Part of the configuration for speech recognition.
-  SpeechRecognitionConfig speech_recognition_config = 4;
-
-  // Part of the configuration for age recognition.
-  AgeRecognitionConfig age_recognition_config = 5;
-
-  // Part of the configuration for gender recognition.
-  GenderRecognitionConfig gender_recognition_config = 6;
-
-  // Part of the configuration for language recognition.
-  LanguageRecognitionConfig language_recognition_config = 7;
-}
-
-// Result configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message ResultConfig {
-  // The switch that toggles continuous recognition into single utterance mode.
-  // The service returns a final result for each end of utterance it detects in
-  // the audio, which may occur multiple times during a request.
-  // If enabled, the request terminates right after its first final result.
-  bool enable_single_utterance = 1;
-
-  // The switch that allows interim results.
-  // If enabled, results containing tentative hypotheses may be returned in
-  // addition to final ones.
-  // The service should silently ignore this field if it is unsupported.
-  bool enable_interim_results = 2;
-}
-
-// Streaming configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingConfig {
-  reserved 1; // bool enable_single_utterance = 1;
-
-  // The switch that enables manual control of the input timer.
-  // The timer imposes two constraints: one that finalizes recognition after
-  // a specified period unless speech is detected, and the other that limits
-  // the total time for an utterance. Manual control allows recognition to
-  // begin but delays enforcement of these constraints. The timer restarts
-  // after each detected end of utterance (each final result).
-  // If enabled, the timer does not start automatically. Instead, it can be
-  // initiated by sending
-  // a [`StreamingRecognizeRequestControlMessage`](#StreamingRecognizeRequestControlMessage)
-  // with the `start_input_timer` field set to `true` as needed. This should
-  // occur after the beginning of the request and be repeated after each final
-  // result.
-  bool enable_manual_input_timer = 2;
-}
-
-// Audio configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message AudioConfig {
-  // The possible audio encodings.
-  enum AudioEncoding {
-    // Unspecified audio encoding.
-    UNSPECIFIED = 0;
-
-    // Linear pulse-code modulation of uncompressed 16-bit signed little-endian
-    // samples.
-    LINEAR16 = 1;
-
-    // Free Lossless Audio Codec ([FLAC](https://wiki.xiph.org/FLAC)).
-    // The encoding requires only about half the bandwidth of `LINEAR16`.
-    // 16-bit and 24-bit samples. Not all fields in `STREAMINFO` are supported.
-    // When set, the service ignores the `sampling_rate_hz` field and detects
-    // the actual value from audio header instead.
-    FLAC = 2;
-
-    // Ogg Encapsulated Opus Audio Codec ([OggOpus](https://wiki.xiph.org/OggOpus)).
-    // When set, the service ignores the `sampling_rate_hz` field and detects
-    // the actual value from audio header instead.
-    OGG_OPUS = 6;
-
-    // MP3 (ISO/IEC 11172-3 and ISO/IEC 13818-3).
-    // Only constant bitrate.
-    // When set, the service ignores the `sampling_rate_hz` field and detects
-    // the actual value from audio header instead.
-    MP3 = 8;
-  }
-
-  // The encoding of the audio data sent in the request. Single channel (mono)
-  // audio is assumed.
-  // The service should respond with the `INVALID_ARGUMENT` gRPC status code
-  // if the encoding is `UNSPECIFIED`.
-  // The service should respond with the `FAILED_PRECONDITION` gRPC status code
-  // if the encoding is not supported.
-  AudioEncoding encoding = 1;
-
-  // The sampling rate of the audio data sent in the request.
-  // The service should silently ignore the field for encodings that are sent
-  // along wtih headers, and detect the value from them instead.
-  // The service should respond with the `INVALID_ARGUMENT` gRPC status code
-  // if the value is not greater than 0.
-  float sampling_rate_hz = 2;
-}
-
-// Configuration of age recognition.
-message AgeRecognitionConfig {
-  // The switch that enables age recognition for the request.
-  // If disabled or unspecified, the related results are excluded.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_age_recognition = 1;
-}
-
-// Configuration of gender recognition.
-message GenderRecognitionConfig {
-  // The switch that enables gender recognition for the request.
-  // If disabled or unspecified, the related results are excluded.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_gender_recognition = 1;
-}
-
-// Configuration of language recognition.
-message LanguageRecognitionConfig {
-  // The switch that enables language recognition for the request.
-  // If disabled or unspecified, the related results are excluded.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_language_recognition = 1;
-}
-
-// Configuration for speech recognition.
-message SpeechRecognitionConfig {
-  // The switch that enables speech recognition for the request.
-  // If disabled or unspecified, the related results are excluded.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_speech_recognition = 1;
-
-  // The maximum number of alternative transcriptions allowed to be included
-  // per response.
-  // The actual count received can be less than the specified value and may
-  // also be equal to 0. If unspecified or 0, one alternative is allowed to be
-  // returned too.
-  uint32 recognition_alternatives_limit = 2;
-
-  // The switch that enables additional time alignment of recognitions in word
-  // details.
-  // If enabled, the `words` field of
-  // a [`SpeechRecognitionAlternative`](#SpeechRecognitionAlternative) message
-  // includes a list of [`SpeechRecognitionWord`](#SpeechRecognitionWord)
-  // messages. Otherwise, it remains empty.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_time_alignment = 3;
-
-  // The name of a language group of models to be used.
-  // If left unspecified, it backs to the service's default group.
-  // The service responds with the `NOT_FOUND` gRPC status code
-  // if the name is not registered.
-  string language_group_name = 4;
-
-  // The name of a model to be used.
-  // If left unspecified, it backs to the selected langugage group's default.
-  // The service responds with the `NOT_FOUND` gRPC status code
-  // if the name is not registered.
-  string model_name = 5;
-
-  // Deprecated.
-  // The additional advanced service-dependend configuration for its speech
-  // recognizer. It may be silently ignored.
-  map<string, string> config_fields = 6;
-}
-
-// A message controlling the processing flow of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingRecognizeRequestControlMessage {
-  // The flag that starts the input timer on demand and resets after each final
-  // result. It is silently ignored if the manual input timer setting is
-  // disabled for the request.
-  optional bool start_input_timer = 1;
-}
-
-// A message that carries data contents of
-// a [`StreamingRecognizeRequest`](#StreamingRecognizeRequest) request.
-message StreamingRecognizeRequestData {
-  // Part of the audio to perform recognition on.
-  Audio audio = 1;
-}
-
-// Audio contents.
-message Audio {
-  oneof audio_content {
-    // The audio data bytes.
-    bytes bytes = 1;
-  }
-}
-
-// A message streamed from the service through
-// the [`StreamingRecognize`](#StreamingRecognize) method.
-message StreamingRecognizeResponse {
-  // The combined recognition results for another part of the audio.
-  StreamingRecognizeResult result = 1;
-
-  // The cumulative duration of the processed audio during the request,
-  // not necessarily matching the actual length of the sent audio, mandatorily
-  // updated with each final result.
-  google.protobuf.Duration processed_audio_duration = 2;
-}
-
-// Combined recognition result.
-message StreamingRecognizeResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The flag indicating whether the result is interim or final.
-  bool is_final = 2;
-
-  // The anticipated causes for the service to finalize a result.
-  enum ResultFinalizationCause {
-    // The cause is not specified.
-    UNSPECIFIED = 0;
-
-    // The speech recognition result is not empty and the end of utterance
-    // is detected.
-    SUCCESS = 1;
-
-    // The speech recognition result is empty after the duration to expect
-    // a result is reached.
-    NO_INPUT_TIMEOUT = 2;
-
-    // The speech recognition result is not empty after the utterance duration
-    // limit is reached. The returned speech recognition is incomplete and
-    // should be completed in the following result.
-    SUCCESS_MAXTIME = 3;
-
-    // Unused.
-    PARTIAL_MATCH = 4;
-
-    // The speech recognition result is empty after the utterance duration
-    // limit is reached.
-    NO_MATCH_MAXTIME = 5;
-  }
-
-  // The field indicating the cause of result finalization.
-  // For interim results, the service should leave the field as `UNSPECIFIED`.
-  // For final results, the service must set the field to a value other than
-  // `UNSPECIFIED`.
-  ResultFinalizationCause result_finalization_cause = 3;
-
-  // The speech recognition result for another part of the processed audio,
-  // new with each final result, updates with each interim one.
-  // To obtain a complete result for all processed audio, for each final result
-  // received, a client should pick one of the result's recognition alternatives
-  // and buffer it on its own.
-  // It must be omitted if speech recognition is disabled.
-  SpeechRecognitionResult speech_recognition_result = 4;
-
-  // The current age recognition result for all processed audio,
-  // updated with each final result.
-  // It may be omitted in an interim result and must be omitted if age
-  // recognition is disabled.
-  AgeRecognitionResult age_recognition_result = 5;
-
-  // The current gender recognition result for all processed audio,
-  // updated with each final result.
-  // It may be omitted in an interim result and must be omitted if gender
-  // recognition is disabled.
-  GenderRecognitionResult gender_recognition_result = 6;
-
-  // The current language recognition result for all processed audio,
-  // updated with each final result.
-  // It may be omitted in an interim result and must be omitted if language
-  // recognition is disabled.
-  LanguageRecognitionResult language_recognition_result = 7;
-}
-
-// A result of age recognition.
-message AgeRecognitionResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The confidence-ordered list of alternative recognition hypotheses.
-  repeated AgeRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of age recognition.
-message AgeRecognitionAlternative {
-  // The assumed age of the person speaking in the audio, in years.
-  // For a reliable value, assure that there is only one person speaking in
-  // the audio.
-  uint32 age = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-}
-
-// A result of gender recognition.
-message GenderRecognitionResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The confidence-ordered list of alternative recognition hypotheses.
-  repeated GenderRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of gender recognition.
-message GenderRecognitionAlternative {
-  // The assumed gender of the person speaking in the audio.
-  // For a reliable value, assure that there is only one person speaking in
-  // the audio.
-  string gender = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-}
-
-// A result of language recognition.
-message LanguageRecognitionResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The confidence-ordered list of alternative recognition hypotheses.
-  repeated LanguageRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of language recognition.
-message LanguageRecognitionAlternative {
-  // The language spoken in the audio,
-  // a [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) tag.
-  string language = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-}
-
-// A result of speech recognition.
-message SpeechRecognitionResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The confidence-ordered list of alternative recognition hypotheses.
-  repeated SpeechRecognitionAlternative recognition_alternatives = 2;
-
-  // The actual name of the language group of the model,
-  // unrelated to the actual language spoken in the audio.
-  string language_group_name = 3;
-
-  // The actual name of the model used to obtain the result.
-  string model_name = 4;
-}
-
-// An alternative hypothesis of speech recognition.
-message SpeechRecognitionAlternative {
-  // The transcript of the audio.
-  string transcript = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-
-  // The details of the transcript's words.
-  // Empty unless `enable_time_alignment` is `true` in the request's
-  // [`SpeechRecognitionConfig`](#SpeechRecognitionConfig).
-  repeated SpeechRecognitionWord words = 3;
-}
-
-// Details of a single word in speech recognition.
-message SpeechRecognitionWord {
-  // The transcript of the word itself.
-  string transcript = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-
-  // The start time of the word relative to the beginning of the entire audio.
-  google.protobuf.Duration start_time = 3;
-
-  // The end time of the word relative to the beginning of the entire audio.
-  google.protobuf.Duration end_time = 4;
-}
diff --git a/proto/techmo/asr/api/v1p1/asr.proto b/proto/techmo/asr/api/v1p1/asr.proto
deleted file mode 100644
index f2af15a..0000000
--- a/proto/techmo/asr/api/v1p1/asr.proto
+++ /dev/null
@@ -1,481 +0,0 @@
-// Copyright 2023 Techmo sp. z o.o.
-
-syntax = "proto3";
-
-package techmo.asr.api.v1p1;
-
-import "google/protobuf/duration.proto";
-import "techmo/api/status.proto";
-
-
-// An automatic speech recognition (ASR) service providing a solution for
-// speech-to-text conversion extended by the assessment of additional speech
-// and speaker features.
-service Asr {
-  // Perform bidirectional streaming recognition.
-  rpc StreamingRecognize(stream StreamingRecognizeRequest)
-    returns (stream StreamingRecognizeResponse) {}
-}
-
-// A message streamed from the client through
-// the [`StreamingRecognize`](#StreamingRecognize) method.
-message StreamingRecognizeRequest {
-  oneof request_content {
-    // The immutable initial configuration of the request.
-    // Must be sent once in the request's first message.
-    StreamingRecognizeRequestConfig config = 1;
-
-    // The message controlling the processing flow of the request.
-    // May be sent multiple times except in the request's first message.
-    StreamingRecognizeRequestControlMessage control_message = 2;
-
-    // The data contents of the request itself.
-    // May be sent multiple times except in the request's first message.
-    StreamingRecognizeRequestData data = 3;
-  }
-}
-
-// A message holding configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingRecognizeRequestConfig {
-  // Part of the configuration for the request's audio content.
-  AudioConfig audio_config = 1;
-
-  // Part of the configuration for the request's result form.
-  ResultConfig result_config = 2;
-
-  // Part of the configuration for the request's processing flow.
-  StreamingConfig streaming_config = 3;
-
-  // Part of the configuration for speech recognition.
-  SpeechRecognitionConfig speech_recognition_config = 4;
-
-  // Part of the configuration for age recognition.
-  AgeRecognitionConfig age_recognition_config = 5;
-
-  // Part of the configuration for gender recognition.
-  GenderRecognitionConfig gender_recognition_config = 6;
-
-  // Part of the configuration for language recognition.
-  LanguageRecognitionConfig language_recognition_config = 7;
-}
-
-// Result configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message ResultConfig {
-  // The switch that toggles continuous recognition into single utterance mode.
-  // The service returns a final result for each end of utterance it detects in
-  // the audio, which may occur multiple times during a request.
-  // If enabled, the request terminates right after its first final result.
-  bool enable_single_utterance = 1;
-
-  // The switch that allows interim results.
-  // If enabled, results containing tentative hypotheses may be returned in
-  // addition to final ones.
-  // The service should silently ignore this field if it is unsupported.
-  bool enable_interim_results = 2;
-
-  // The switch to allow the service merging responses in the "hold response"
-  // state.
-  // If enabled and there is more than a single response held, the service does
-  // not return them in a batch. Instead, it tries to merge their results into
-  // a single response.
-  // The service should respond with the `INVALID_ARGUMENT` gRPC status code
-  // if the `recognition_alternatives_limit` field
-  // of the [`SpeechRecognitionConfig`](#SpeechRecognitionConfig) message is
-  // greater than 1.
-  // New in v1p1.
-  bool enable_held_responses_merging = 3;
-}
-
-// Streaming configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingConfig {
-  reserved 1; // bool enable_single_utterance = 1;
-
-  // The switch that enables manual control of the input timer.
-  // The timer imposes two constraints: one that finalizes recognition after
-  // a specified period unless speech is detected, and the other that limits
-  // the total time for an utterance. Manual control allows recognition to
-  // begin but delays enforcement of these constraints. The timer restarts
-  // after each detected end of utterance (each final result).
-  // If enabled, the timer does not start automatically. Instead, it can be
-  // initiated by sending
-  // a [`StreamingRecognizeRequestControlMessage`](#StreamingRecognizeRequestControlMessage)
-  // with the `start_input_timer` field set to `true` as needed. This should
-  // occur after the beginning of the request and be repeated after each final
-  // result.
-  bool enable_manual_input_timer = 2;
-
-  // The switch to automatically set the service in the "hold response" state
-  // at the beginning of the request and after each final result.
-  // The "hold response" state means that the internal recognition process
-  // continues, but results are kept, not returned. When needed, the state can
-  // be toggled into the "give response" state by sending
-  // the [`StreamingRecognizeRequestControlMessage`](#StreamingRecognizeRequestControlMessage)
-  // message with the `give_response` field set to `true`.
-  // In the "give response" state the service responds as soon as it is ready.
-  // Any held responses may be returned in a batch or as a single merged
-  // response, provided that the `enable_held_responses_merging` field
-  // of the [`ResultConfig`](#ResultConfig) message is set to `true`.
-  // New in v1p1.
-  bool enable_auto_hold_response = 3;
-}
-
-// Audio configuration of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message AudioConfig {
-  // The possible audio encodings.
-  enum AudioEncoding {
-    // Unspecified audio encoding.
-    UNSPECIFIED = 0;
-
-    // Linear pulse-code modulation of uncompressed 16-bit signed little-endian
-    // samples.
-    LINEAR16 = 1;
-
-    // Free Lossless Audio Codec ([FLAC](https://wiki.xiph.org/FLAC)).
-    // The encoding requires only about half the bandwidth of `LINEAR16`.
-    // 16-bit and 24-bit samples. Not all fields in `STREAMINFO` are supported.
-    // When set, the service ignores the `sampling_rate_hz` field and detects
-    // the actual value from audio header instead.
-    FLAC = 2;
-
-    // Ogg Encapsulated Opus Audio Codec ([OggOpus](https://wiki.xiph.org/OggOpus)).
-    // When set, the service ignores the `sampling_rate_hz` field and detects
-    // the actual value from audio header instead.
-    OGG_OPUS = 6;
-
-    // MP3 (ISO/IEC 11172-3 and ISO/IEC 13818-3).
-    // Only constant bitrate.
-    // When set, the service ignores the `sampling_rate_hz` field and detects
-    // the actual value from audio header instead.
-    MP3 = 8;
-  }
-
-  // The encoding of the audio data sent in the request. Single channel (mono)
-  // audio is assumed.
-  // The service should respond with the `INVALID_ARGUMENT` gRPC status code
-  // if the encoding is `UNSPECIFIED`.
-  // The service should respond with the `FAILED_PRECONDITION` gRPC status code
-  // if the encoding is not supported.
-  AudioEncoding encoding = 1;
-
-  // The sampling rate of the audio data sent in the request.
-  // The service should silently ignore the field for encodings that are sent
-  // along wtih headers, and detect the value from them instead.
-  // The service should respond with the `INVALID_ARGUMENT` gRPC status code
-  // if the value is not greater than 0.
-  float sampling_rate_hz = 2;
-}
-
-// Configuration of age recognition.
-message AgeRecognitionConfig {
-  // The switch that enables age recognition for the request.
-  // If disabled or unspecified, the related results are excluded.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_age_recognition = 1;
-}
-
-// Configuration of gender recognition.
-message GenderRecognitionConfig {
-  // The switch that enables gender recognition for the request.
-  // If disabled or unspecified, the related results are excluded.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_gender_recognition = 1;
-}
-
-// Configuration of language recognition.
-message LanguageRecognitionConfig {
-  // The switch that enables language recognition for the request.
-  // If disabled or unspecified, the related results are excluded.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_language_recognition = 1;
-}
-
-// Configuration for speech recognition.
-message SpeechRecognitionConfig {
-  // The switch that enables speech recognition for the request.
-  // If disabled or unspecified, the related results are excluded.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_speech_recognition = 1;
-
-  // The maximum number of alternative transcriptions allowed to be included
-  // per response.
-  // The actual count received can be less than the specified value and may
-  // also be equal to 0. If unspecified or 0, one alternative is allowed to be
-  // returned too.
-  uint32 recognition_alternatives_limit = 2;
-
-  // The switch that enables additional time alignment of recognitions in word
-  // details.
-  // If enabled, the `words` field of
-  // a [`SpeechRecognitionAlternative`](#SpeechRecognitionAlternative) message
-  // includes a list of [`SpeechRecognitionWord`](#SpeechRecognitionWord)
-  // messages. Otherwise, it remains empty.
-  // The service responds with the `FAILED_PRECONDITION` gRPC status code
-  // if requested but not enabled.
-  bool enable_time_alignment = 3;
-
-  // The name of a language group of models to be used.
-  // If left unspecified, it backs to the service's default group.
-  // The service responds with the `NOT_FOUND` gRPC status code
-  // if the name is not registered.
-  string language_group_name = 4;
-
-  // The name of a model to be used.
-  // If left unspecified, it backs to the selected langugage group's default.
-  // The service responds with the `NOT_FOUND` gRPC status code
-  // if the name is not registered.
-  string model_name = 5;
-
-  // Deprecated.
-  // The additional advanced service-dependend configuration for its speech
-  // recognizer. It may be silently ignored.
-  map<string, string> config_fields = 6;
-}
-
-// A message controlling the processing flow of
-// a [`StreamingRecognize`](#StreamingRecognize) request.
-message StreamingRecognizeRequestControlMessage {
-  reserved 2;
-
-  oneof control_message_content
-  {
-    // The flag that starts the input timer on demand and resets after each final
-    // result. It is silently ignored if the manual input timer setting is
-    // disabled for the request.
-    bool start_input_timer = 1;
-
-    // The flag to allow the service to return a response.
-    // After receiving this message, the service remains in the "give response"
-    // state. Ignored when the service is already in the "give response" state.
-    // Mutually exclusive with the `hold_response` field.
-    // New in v1p1.
-    bool give_response = 3;
-
-    // The flag to forbid the service from returning a response.
-    // After receiving this message, the service remains in the "hold response"
-    // state. Ignored when the service is already in the "hold response" state.
-    // Mutually exclusive with the `give_response` field.
-    // New in v1p1.
-    bool hold_response = 4;
-  }
-}
-
-// A message that carries data contents of
-// a [`StreamingRecognizeRequest`](#StreamingRecognizeRequest) request.
-message StreamingRecognizeRequestData {
-  // Part of the audio to perform recognition on.
-  Audio audio = 1;
-}
-
-// Audio contents.
-message Audio {
-  oneof audio_content {
-    // The audio data bytes.
-    bytes bytes = 1;
-  }
-}
-
-// A message streamed from the service through
-// the [`StreamingRecognize`](#StreamingRecognize) method.
-message StreamingRecognizeResponse {
-  // The combined recognition results for another part of the audio.
-  StreamingRecognizeResult result = 1;
-
-  // The cumulative duration of the processed audio during the request,
-  // not necessarily matching the actual length of the sent audio, mandatorily
-  // updated with each final result.
-  google.protobuf.Duration processed_audio_duration = 2;
-}
-
-// Combined recognition result.
-message StreamingRecognizeResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The flag indicating whether the result is interim or final.
-  bool is_final = 2;
-
-  // The anticipated causes for the service to finalize a result.
-  enum ResultFinalizationCause {
-    // The cause is not specified.
-    UNSPECIFIED = 0;
-
-    // The speech recognition result is not empty and the end of utterance
-    // is detected.
-    SUCCESS = 1;
-
-    // The speech recognition result is empty after the duration to expect
-    // a result is reached.
-    NO_INPUT_TIMEOUT = 2;
-
-    // The speech recognition result is not empty after the utterance duration
-    // limit is reached. The returned speech recognition is incomplete and
-    // should be completed in the following result.
-    SUCCESS_MAXTIME = 3;
-
-    // Unused.
-    PARTIAL_MATCH = 4;
-
-    // The speech recognition result is empty after the utterance duration
-    // limit is reached.
-    NO_MATCH_MAXTIME = 5;
-  }
-
-  // The field indicating the cause of result finalization.
-  // For interim results, the service should leave the field as `UNSPECIFIED`.
-  // For final results, the service must set the field to a value other than
-  // `UNSPECIFIED`.
-  ResultFinalizationCause result_finalization_cause = 3;
-
-  // The speech recognition result for another part of the processed audio,
-  // new with each final result, updates with each interim one.
-  // To obtain a complete result for all processed audio, for each final result
-  // received, a client should pick one of the result's recognition alternatives
-  // and buffer it on its own.
-  // It must be omitted if speech recognition is disabled.
-  SpeechRecognitionResult speech_recognition_result = 4;
-
-  // The current age recognition result for all processed audio,
-  // updated with each final result.
-  // It may be omitted in an interim result and must be omitted if age
-  // recognition is disabled.
-  AgeRecognitionResult age_recognition_result = 5;
-
-  // The current gender recognition result for all processed audio,
-  // updated with each final result.
-  // It may be omitted in an interim result and must be omitted if gender
-  // recognition is disabled.
-  GenderRecognitionResult gender_recognition_result = 6;
-
-  // The current language recognition result for all processed audio,
-  // updated with each final result.
-  // It may be omitted in an interim result and must be omitted if language
-  // recognition is disabled.
-  LanguageRecognitionResult language_recognition_result = 7;
-}
-
-// A result of age recognition.
-message AgeRecognitionResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The confidence-ordered list of alternative recognition hypotheses.
-  repeated AgeRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of age recognition.
-message AgeRecognitionAlternative {
-  // The assumed age of the person speaking in the audio, in years.
-  // For a reliable value, assure that there is only one person speaking in
-  // the audio.
-  uint32 age = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-}
-
-// A result of gender recognition.
-message GenderRecognitionResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The confidence-ordered list of alternative recognition hypotheses.
-  repeated GenderRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of gender recognition.
-message GenderRecognitionAlternative {
-  // The assumed gender of the person speaking in the audio.
-  // For a reliable value, assure that there is only one person speaking in
-  // the audio.
-  string gender = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-}
-
-// A result of language recognition.
-message LanguageRecognitionResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The confidence-ordered list of alternative recognition hypotheses.
-  repeated LanguageRecognitionAlternative recognition_alternatives = 2;
-}
-
-// An alternative hypothesis of language recognition.
-message LanguageRecognitionAlternative {
-  // The language spoken in the audio,
-  // a [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) tag.
-  string language = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-}
-
-// A result of speech recognition.
-message SpeechRecognitionResult {
-  // The recognition process status.
-  // It may communicate warnings. In case of an error hindering recognition,
-  // all other message fields should be left unset.
-  techmo.api.Status error = 1;
-
-  // The confidence-ordered list of alternative recognition hypotheses.
-  repeated SpeechRecognitionAlternative recognition_alternatives = 2;
-
-  // The actual name of the language group of the model,
-  // unrelated to the actual language spoken in the audio.
-  string language_group_name = 3;
-
-  // The actual name of the model used to obtain the result.
-  string model_name = 4;
-}
-
-// An alternative hypothesis of speech recognition.
-message SpeechRecognitionAlternative {
-  // The transcript of the audio.
-  string transcript = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-
-  // The details of the transcript's words.
-  // Empty unless `enable_time_alignment` is `true` in the request's
-  // [`SpeechRecognitionConfig`](#SpeechRecognitionConfig).
-  repeated SpeechRecognitionWord words = 3;
-}
-
-// Details of a single word in speech recognition.
-message SpeechRecognitionWord {
-  // The transcript of the word itself.
-  string transcript = 1;
-
-  // The confidence estimate, ranging from 0.0 to 1.0.
-  // Support for this feature is optional.
-  optional float confidence = 2;
-
-  // The start time of the word relative to the beginning of the entire audio.
-  google.protobuf.Duration start_time = 3;
-
-  // The end time of the word relative to the beginning of the entire audio.
-  google.protobuf.Duration end_time = 4;
-}
diff --git a/pyproject.toml b/pyproject.toml
index 873297f..3e5c133 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,15 +1,23 @@
 [build-system]
-requires = ["grpcio-tools>=1.49.4,<1.63", "setuptools>=61"]
+requires = ["grpcio-tools>=1.49.4,<1.71.0", "setuptools>=61"]
 build-backend = "setuptools.build_meta"
 
 [project]
 name = "techmo-asr-api"
-description = "Techmo ASR API (public)"
+description = "Techmo ASR API"
 dynamic = ["version"]
 readme = { file = "README.md", content-type = "text/markdown" }
 authors = [{ name = "Techmo sp. z o.o", email = "kontakt@techmo.pl" }]
 requires-python = ">=3.8"
-dependencies = ["grpcio>=1.49.4,<1.63", "protobuf>=4.21.3,<5"]
+dependencies = [
+    "grpcio>=1.49.4,<1.71.0; python_version=='3.8'",
+    "grpcio>=1.49.4; python_version>='3.9'",
+    "protobuf>=4.21.3,<6.0.0; python_version=='3.8'",
+    "protobuf>=4.21.3; python_version>='3.9'",
+]
+
+[project.optional-dependencies]
+tests = ["pytest<8,>=7.4.4", "pytest-cov>=4.1", "pytest-lazy-fixture>=0.6.3"]
 
 [project.urls]
 repository = "https://github.com/techmo-pl/asr-api-python"
@@ -19,3 +27,28 @@ version = { attr = "asr_api.VERSION.__version__" }
 
 [tool.setuptools.packages.find]
 include = ["asr_api*", "google*", "techmo*"]
+
+[tool.pytest.ini_options]
+addopts = ["--strict-markers"]
+markers = ["""api(name): mark tests as defined for <name> API. \
+    Example: api('techmo.asr.api.v1p1'). \
+    Use the `--api=<name>` option to collect the marked tests."""]
+testpaths = ["tests"]
+
+[tool.coverage.report]
+precision = 1
+show_missing = true
+
+[tool.mypy]
+
+[[tool.mypy.overrides]]
+module = ["techmo.*", "google.*"]
+ignore_errors = true
+
+[[tool.mypy.overrides]]
+module = "tests.*"
+disallow_untyped_decorators = false
+
+[tool.coverage.run]
+source_pkgs = ["asr_api"]
+relative_files = true
diff --git a/setup.py b/setup.py
index 196ac3d..2ca8195 100644
--- a/setup.py
+++ b/setup.py
@@ -1,71 +1,90 @@
+import os
 from pathlib import Path
-from typing import List
+from typing import Any, Optional, Sequence, Union
 
 import setuptools
 
+_PathLike = Union[str, bytes, "os.PathLike[Any]"]
+_PathLikes = Sequence[_PathLike]
 
-def protoc(args: List[str]):
-    import pkg_resources
+
+def _update_submodule(
+    submodule_path: _PathLike,
+    git_submodule_update_options: Sequence[str] = ("--init", "--depth", "1", "--"),
+    working_directory_path: Optional[_PathLike] = None,
+) -> None:
+    """Run `git submodule update` for `submodule_path` unless it already holds a `.git` entry.
+
+    Paths are decoded with `os.fsdecode` so `bytes` and `os.PathLike` values resolve correctly
+    (`str()` would turn `b"x"` into `"b'x'"`). Raises `Exception` if git exits non-zero.
+    """
+    import subprocess
+
+    submodule = os.fsdecode(submodule_path)
+    if (Path(os.fsdecode(working_directory_path or ".")) / submodule / ".git").exists():
+        return
+    command = ("git", "submodule", "update") + tuple(git_submodule_update_options) + (submodule,)
+    if subprocess.call(command, cwd=working_directory_path) != 0:
+        raise Exception(f"error: {command} failed")
+
+
+def _protoc(*args: str) -> None:
+    import grpc_tools
     from grpc_tools import protoc
 
-    command = [
-        "grpc_tools.protoc",
-        "--proto_path={}".format(
-            Path(pkg_resources.resource_filename("grpc_tools", "_proto"))
-        ),
-    ] + args
-
-    if protoc.main(command) != 0:
-        raise Exception("error: {} failed".format(command))
-
-
-def build_package_grpc_protos(
-    protos_paths: List[Path], import_directory_paths: List[Path] = []
-):
-    protoc(
-        [
-            "--proto_path={}".format(Path(import_directory_path))
-            for import_directory_path in import_directory_paths
-        ]
-        + ["--grpc_python_out=."]
-        + protos_paths,
+    if (
+        protoc.main(
+            command := (
+                "grpc_tools.protoc",
+                "--proto_path={}".format(Path(grpc_tools.__file__).parent / "_proto"),
+            )
+            + args
+        )
+        != 0
+    ):
+        raise Exception(f"error: {command} failed")
+
+
+def _build_package_grpc_protos(
+    proto_paths: _PathLikes,
+    import_directory_paths: Optional[_PathLikes] = None,
+) -> None:
+    """Run protoc with `--grpc_python_out=.`; paths are decoded via `os.fsdecode` (bytes-safe)."""
+    includes = [f"--proto_path={os.fsdecode(path)}" for path in import_directory_paths or ()]
+    _protoc(
+        *includes, "--grpc_python_out=.", *map(os.fsdecode, proto_paths)
     )
 
 
-def build_package_protos(
-    protos_paths: List[Path], import_directory_paths: List[Path] = []
-):
-    protoc(
-        [
-            "--proto_path={}".format(Path(import_directory_path))
-            for import_directory_path in import_directory_paths
-        ]
-        + ["--python_out=."]
-        + protos_paths,
+def _build_package_protos(
+    proto_paths: _PathLikes,
+    import_directory_paths: Optional[_PathLikes] = None,
+) -> None:
+    """Run protoc with `--python_out=.`; paths are decoded via `os.fsdecode` (bytes-safe)."""
+    includes = [f"--proto_path={os.fsdecode(path)}" for path in import_directory_paths or ()]
+    _protoc(
+        *includes, "--python_out=.", *map(os.fsdecode, proto_paths)
     )
 
 
-build_package_protos(
-    protos_paths=[
-        "./proto/google/rpc/status.proto",
-        "./proto/techmo/api/status.proto",
-        "./proto/techmo/asr/api/dictation/asr.proto",
-        "./proto/techmo/asr/api/v1/asr.proto",
-        "./proto/techmo/asr/api/v1p1/asr.proto",
-    ],
-    import_directory_paths=[
-        "./proto",
-    ],
+_update_submodule("./submodules/asr-api")
+_build_package_protos(
+    (
+        "./submodules/asr-api/proto/google/rpc/status.proto",
+        "./submodules/asr-api/proto/techmo/api/status.proto",
+        "./submodules/asr-api/proto/techmo/asr/api/dictation/asr.proto",
+        "./submodules/asr-api/proto/techmo/asr/api/v1/asr.proto",
+        "./submodules/asr-api/proto/techmo/asr/api/v1p1/asr.proto",
+    ),
+    import_directory_paths=("./submodules/asr-api/proto",),
 )
-build_package_grpc_protos(
-    protos_paths=[
-        "./proto/techmo/asr/api/dictation/asr.proto",
-        "./proto/techmo/asr/api/v1/asr.proto",
-        "./proto/techmo/asr/api/v1p1/asr.proto",
-    ],
-    import_directory_paths=[
-        "./proto",
-    ],
+_build_package_grpc_protos(
+    (
+        "./submodules/asr-api/proto/techmo/asr/api/dictation/asr.proto",
+        "./submodules/asr-api/proto/techmo/asr/api/v1/asr.proto",
+        "./submodules/asr-api/proto/techmo/asr/api/v1p1/asr.proto",
+    ),
+    import_directory_paths=("./submodules/asr-api/proto",),
 )
 
 setuptools.setup()
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..f319aa5
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#
+# usage: ./setup.sh
+#
+# Run once after cloning: initialises git submodules.
+
+set -euo pipefail
+
+git submodule sync --recursive
+git submodule update --init --recursive
diff --git a/submodules/asr-api b/submodules/asr-api
new file mode 160000
index 0000000..084c836
--- /dev/null
+++ b/submodules/asr-api
@@ -0,0 +1 @@
+Subproject commit 084c836bff448aff140dd2391499a297aacabc4f
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..7650569
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,40 @@
+from typing import List
+
+import pytest
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+    """Register the `--api=<name>` option used to select tests by their `api` marker."""
+    parser.addoption(
+        "--api",
+        default=None,
+        choices=[
+            "techmo.asr.api.dictation",
+            "techmo.asr.api.v1",
+            "techmo.asr.api.v1p1",
+        ],
+        help="the argument of tests marked with the `@pytest.mark.api(name)` marker to be collected; one of: %(choices)s (default is %(default)r)",
+        metavar="name",
+    )
+
+
+def pytest_collection_modifyitems(config: pytest.Config, items: List[pytest.Item]) -> None:
+    """Keep only tests whose closest `api` marker's first argument equals `--api`.
+
+    Does nothing when `--api` is not given. Dropped tests are passed to the
+    `pytest_deselected` hook so pytest reports them as deselected.
+    """
+    api = config.getoption("--api")
+    if not api:
+        return
+
+    selected: List[pytest.Item] = []
+    deselected: List[pytest.Item] = []
+    for item in items:
+        mark = item.get_closest_marker("api")
+        matches = mark is not None and bool(mark.args) and mark.args[0] == api
+        (selected if matches else deselected).append(item)
+
+    if deselected:
+        config.hook.pytest_deselected(items=deselected)
+        items[:] = selected
diff --git a/tests/test_asr_api.py b/tests/test_asr_api.py
new file mode 100644
index 0000000..2dcf141
--- /dev/null
+++ b/tests/test_asr_api.py
@@ -0,0 +1,140 @@
+from typing import cast
+
+import pytest
+
+
+@pytest.fixture
+def asr_api_dictation() -> object:
+    import asr_api.dictation
+
+    return asr_api.dictation
+
+
+@pytest.fixture(
+    params=(
+        "Speech",
+        "RecognizeRequest",
+        "StreamingRecognizeRequest",
+        "StreamingRecognitionConfig",
+        "RecognitionConfig",
+        "SpeechDurationConfig",
+        "SpeechDurationThresholdMode",
+        "SpeechContext",
+        "ConfigField",
+        "RecognitionAudio",
+        "RecognizeResponse",
+        "StreamingRecognizeResponse",
+        "StreamingRecognitionResult",
+        "SpeechRecognitionResult",
+        "SpeechRecognitionAlternative",
+        "WordInfo",
+        "RecognitionLattice",
+        "LatticeEdge",
+        "Gender",
+        "Age",
+    ),
+)
+def asr_api_dictation_attr(request: pytest.FixtureRequest) -> str:
+    return cast(str, request.param)
+
+
+@pytest.fixture
+def asr_api_v1() -> object:
+    import asr_api.v1
+
+    return asr_api.v1
+
+
+@pytest.fixture(
+    params=(
+        "Asr",
+        "StreamingRecognizeRequest",
+        "StreamingRecognizeRequestConfig",
+        "ResultConfig",
+        "StreamingConfig",
+        "AudioConfig",
+        "AgeRecognitionConfig",
+        "GenderRecognitionConfig",
+        "LanguageRecognitionConfig",
+        "SpeechRecognitionConfig",
+        "StreamingRecognizeRequestControlMessage",
+        "StreamingRecognizeRequestData",
+        "Audio",
+        "StreamingRecognizeResponse",
+        "StreamingRecognizeResult",
+        "AgeRecognitionResult",
+        "AgeRecognitionAlternative",
+        "GenderRecognitionResult",
+        "GenderRecognitionAlternative",
+        "LanguageRecognitionResult",
+        "LanguageRecognitionAlternative",
+        "SpeechRecognitionResult",
+        "SpeechRecognitionAlternative",
+        "SpeechRecognitionWord",
+    ),
+)
+def asr_api_v1_attr(request: pytest.FixtureRequest) -> str:
+    return cast(str, request.param)
+
+
+@pytest.fixture
+def asr_api_v1p1() -> object:
+    import asr_api.v1p1
+
+    return asr_api.v1p1
+
+
+@pytest.fixture(
+    params=(
+        "Asr",
+        "StreamingRecognizeRequest",
+        "StreamingRecognizeRequestConfig",
+        "ResultConfig",
+        "StreamingConfig",
+        "AudioConfig",
+        "AgeRecognitionConfig",
+        "GenderRecognitionConfig",
+        "LanguageRecognitionConfig",
+        "SpeechRecognitionConfig",
+        "StreamingRecognizeRequestControlMessage",
+        "StreamingRecognizeRequestData",
+        "Audio",
+        "StreamingRecognizeResponse",
+        "StreamingRecognizeResult",
+        "AgeRecognitionResult",
+        "AgeRecognitionAlternative",
+        "GenderRecognitionResult",
+        "GenderRecognitionAlternative",
+        "LanguageRecognitionResult",
+        "LanguageRecognitionAlternative",
+        "SpeechRecognitionResult",
+        "SpeechRecognitionAlternative",
+        "SpeechRecognitionWord",
+    ),
+)
+def asr_api_v1p1_attr(request: pytest.FixtureRequest) -> str:
+    return cast(str, request.param)
+
+
+@pytest.mark.parametrize(
+    "api, attr",
+    (
+        pytest.param(
+            pytest.lazy_fixture("asr_api_dictation"),
+            pytest.lazy_fixture("asr_api_dictation_attr"),
+            marks=pytest.mark.api("techmo.asr.api.dictation"),
+        ),
+        pytest.param(
+            pytest.lazy_fixture("asr_api_v1"),
+            pytest.lazy_fixture("asr_api_v1_attr"),
+            marks=pytest.mark.api("techmo.asr.api.v1"),
+        ),
+        pytest.param(
+            pytest.lazy_fixture("asr_api_v1p1"),
+            pytest.lazy_fixture("asr_api_v1p1_attr"),
+            marks=pytest.mark.api("techmo.asr.api.v1p1"),
+        ),
+    ),
+)
+def test_hasattr(api: object, attr: str) -> None:
+    assert hasattr(api, attr)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..ce99428
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,36 @@
+[tox]
+# Python 3.7 is not downloadable via uv; minimum testable version is 3.8.
+# Python 3.14 is included to catch forward-compat issues early.
+envlist = py38, py39, py310, py311, py312, py313, py314
+requires =
+    tox-uv>=1
+
+[testenv]
+# skip_install keeps each env lightweight: the package is found via PYTHONPATH
+# rather than doing a full editable install in each env.
+# Proto stubs (techmo/asr/api/*/asr_pb2.py) are gitignored — they are generated
+# by setup.py at build time.  In CI a workflow step runs ./install.sh first;
+# locally, run `./install.sh` first (requires grpcio-tools and the asr-api
+# submodule).
+skip_install = true
+set_env = PYTHONPATH = {toxinidir}
+# Pass service-address variables so integration tests can connect to a live service
+# when run via tox (e.g. tox -e py312 -- -m integration).
+passenv =
+    ASR_*
+deps =
+    # grpcio 1.71.0 dropped Python 3.8
+    grpcio>=1.49.4,<1.71.0; python_version=="3.8"
+    grpcio>=1.49.4; python_version>="3.9"
+    protobuf>=4.21.3,<6.0.0; python_version=="3.8"
+    protobuf>=4.21.3; python_version>="3.9"
+    pytest>=7.4.4,<8
+    pytest-cov>=4.1
+    pytest-lazy-fixture>=0.6.3
+commands_pre =
+    # Abort early with a clear message if proto stubs are missing rather than
+    # letting pytest fail with a cryptic ImportError deep in imports.
+    # PYTHONPATH already contains {toxinidir} so we use it to locate the stub.
+    python -c "import os, sys; stub = os.path.join(os.environ['PYTHONPATH'], 'techmo', 'asr', 'api', 'dictation', 'asr_pb2.py'); sys.exit(0) if os.path.exists(stub) else sys.exit('Proto stubs missing. Run: ./install.sh')"
+commands =
+    pytest --color=yes --cov=asr_api --cov-report=term-missing --cov-report=xml:{envtmpdir}/coverage.xml {posargs}