+ {%- set dep = class.extra.hopsworks_apigen.deprecated -%}
+ {%- set available_until = dep.available_until -%}
+ {%- set deprecated_by = dep.deprecated_by -%}
+ {%- set version = "version " + available_until if available_until else "a future release" -%}
+
+ {{ class.name }} is deprecated and will be removed in {{ version }} of Hopsworks.
+ Consider using {% for rec in deprecated_by %}{{ rec.split(".")[-2:] | join(".") }}{% if not loop.last %}, {% endif %}{% endfor %} instead.
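+ {#- Illustrative: rec.split(".")[-2:] keeps the last two path components, so a deprecated_by entry like "hsfs.feature_store.FeatureStore" renders as "feature_store.FeatureStore". -#}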
+
+
+ {% endif %}
+ {% endblock deprecation %}
+
{% block docstring scoped %}
{#- Docstring block.
diff --git a/docs/templates/python/material/function.html.jinja b/docs/templates/python/material/function.html.jinja
index 7025253c5..16eae44e5 100644
--- a/docs/templates/python/material/function.html.jinja
+++ b/docs/templates/python/material/function.html.jinja
@@ -160,6 +160,33 @@ Context:
It contains other blocks that users can override.
Overriding the contents block allows you to rearrange the order of the blocks.
-#}
+ {% block deprecation scoped %}
+ {% if function.extra.hopsworks_apigen and function.extra.hopsworks_apigen.deprecated %}
+
+ Deprecated
+
+ {%- set dep = function.extra.hopsworks_apigen.deprecated -%}
+ {%- set available_until = dep.available_until -%}
+ {%- set deprecated_by = dep.deprecated_by -%}
+ {%- set version = "version " + available_until if available_until else "a future release" -%}
+
+ {{ function.name }} is deprecated and will be removed in {{ version }} of Hopsworks.
+ Consider using {% for rec in deprecated_by %}{{ rec.split(".")[-2:] | join(".") }}{% if not loop.last %}, {% endif %}{% endfor %} instead.
+
+
+ {% endif %}
+ {% endblock deprecation %}
+
+ {% block docstring scoped %}
+ {#- Docstring block.
+
+ This block renders the docstring for the function.
+ -#}
+ {% with docstring_sections = function.docstring.parsed %}
+ {% include "docstring.html.jinja" with context %}
+ {% endwith %}
+ {% endblock docstring %}
+
{% block aliases scoped %}
{% if function.extra.hopsworks_apigen and function.extra.hopsworks_apigen.aliases %}
{%- set public_aliases = function.extra.hopsworks_apigen.aliases | selectattr("is_public") | list -%}
@@ -184,16 +211,6 @@ Context:
{% endif %}
{% endblock aliases %}
- {% block docstring scoped %}
- {#- Docstring block.
-
- This block renders the docstring for the function.
- -#}
- {% with docstring_sections = function.docstring.parsed %}
- {% include "docstring.html.jinja" with context %}
- {% endwith %}
- {% endblock docstring %}
-
{% block source scoped %}
{#- Source block.
From 2b26deaf90986a8aa1d333d38d2c376502266de8 Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Tue, 24 Feb 2026 16:06:24 +0100
Subject: [PATCH 05/16] Add Arrow crossref
---
mkdocs.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/mkdocs.yml b/mkdocs.yml
index 749cebddb..a4fda9886 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -373,6 +373,7 @@ plugins:
- https://fastapi.tiangolo.com/objects.inv
- https://scikit-learn.org/stable/objects.inv
- https://docs.pola.rs/api/python/stable/objects.inv
+ - https://arrow.apache.org/docs/objects.inv
markdown_extensions:
- admonition
From fb68e4cc3896652d3ea04834abfcd0313eb24168 Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Tue, 24 Feb 2026 16:17:56 +0100
Subject: [PATCH 06/16] Fix "Returned by" rendering
---
docs/templates/python/material/class.html.jinja | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/docs/templates/python/material/class.html.jinja b/docs/templates/python/material/class.html.jinja
index 51cbe3b50..e0d0834c1 100644
--- a/docs/templates/python/material/class.html.jinja
+++ b/docs/templates/python/material/class.html.jinja
@@ -294,13 +294,12 @@ Context:
{% if class.extra.hopsworks_apigen and class.extra.hopsworks_apigen.aliases %}
Returned by
-
-
+
{%- for alias in class.extra.hopsworks_apigen.aliases -%}
{%- set alias_id = alias.target_module + "." + alias.alias_name -%}
- {% if alias_id != class.path %}
+ {%- if alias_id != class.path -%}
- {% endif %}
+ {%- endif -%}
{%- endfor -%}
From 1ca7b8b14c5fafef51f6b9238108e12f0db1dd06 Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Wed, 25 Feb 2026 11:16:49 +0100
Subject: [PATCH 07/16] Improve examples rendering
---
docs/user_guides/fs/data_source/usage.md | 9 +
.../fs/feature_group/data_types.md | 11 +-
.../on_demand_transformations.md | 303 +++++++++---------
.../user_guides/fs/feature_view/batch-data.md | 20 +-
.../fs/feature_view/feature-vectors.md | 64 ++--
.../fs/feature_view/feature_monitoring.md | 8 +
.../fs/feature_view/helper-columns.md | 35 +-
.../model-dependent-transformations.md | 55 ++--
docs/user_guides/fs/feature_view/overview.md | 9 +
docs/user_guides/fs/feature_view/query.md | 27 +-
.../fs/feature_view/training-data.md | 34 +-
.../fs/transformation_functions.md | 68 ++--
docs/user_guides/migration/40_migration.md | 2 +
.../mlops/registry/frameworks/llm.md | 4 +
.../mlops/registry/frameworks/python.md | 4 +
.../mlops/registry/frameworks/skl.md | 4 +
.../mlops/registry/frameworks/tch.md | 4 +
.../mlops/registry/frameworks/tf.md | 4 +
.../mlops/registry/input_example.md | 3 +
.../mlops/registry/model_evaluation_images.md | 3 +
.../mlops/registry/model_schema.md | 3 +
docs/user_guides/mlops/serving/predictor.md | 5 +
docs/user_guides/mlops/serving/rest-api.md | 7 +-
docs/user_guides/mlops/serving/transformer.md | 1 +
docs/user_guides/projects/opensearch/knn.md | 5 +
25 files changed, 418 insertions(+), 274 deletions(-)
diff --git a/docs/user_guides/fs/data_source/usage.md b/docs/user_guides/fs/data_source/usage.md
index 49e049195..46b498809 100644
--- a/docs/user_guides/fs/data_source/usage.md
+++ b/docs/user_guides/fs/data_source/usage.md
@@ -16,6 +16,7 @@ We will walk through each functionality in the sections below.
We retrieve a data source simply by its unique name.
=== "PySpark"
+
```python
import hopsworks
# Connect to the Hopsworks feature store
@@ -26,6 +27,7 @@ We retrieve a data source simply by its unique name.
```
=== "Scala"
+
```scala
import com.logicalclocks.hsfs._
val connection = HopsworksConnection.builder().build();
@@ -46,12 +48,14 @@ The exact behaviour could change depending on the data source type, but broadly
For data sources based on object/file storage such as AWS S3, ADLS, or GCS, we set the full object path in the `path` argument and pass a Spark data format (parquet, csv, orc, hudi, delta) in the `data_format` argument.
=== "PySpark"
+
```python
# read data into dataframe using path
df = connector.read(data_format='data_format', path='fileScheme://bucket/path/')
```
=== "Scala"
+
```scala
// read data into dataframe using path
val df = connector.read("", "data_format", new HashMap(), "fileScheme://bucket/path/")
@@ -75,6 +79,7 @@ Using `prepare_spark` is also not necessary when using the `read` API.
For example, to read directly from an S3 connector, we use `prepare_spark` as follows:
=== "PySpark"
+
```python
connector.prepare_spark()
spark.read.format("json").load("s3a://[bucket]/path")
@@ -90,6 +95,7 @@ Depending on the connector type, users can also just set the table path and read
This is mostly relevant for Google BigQuery.
=== "PySpark"
+
```python
# read results from a SQL
df = connector.read(query="SELECT * FROM TABLE")
@@ -98,6 +104,7 @@ This is mostly relevant for Google BigQuery.
```
=== "Scala"
+
```scala
// read results from a SQL
val df = connector.read("SELECT * FROM TABLE", "" , new HashMap(),"")
@@ -125,6 +132,7 @@ Depending on the external source, we should set either the `query` argument for
For example, for data warehouse/SQL-based external sources, we set the desired SQL statement in the `query` argument, and set the `data_source` argument to the desired data source object.
=== "PySpark"
+
```python
ds.query="SELECT * FROM TABLE"
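+
+ # Illustrative continuation: read the query results into a dataframe via the
+ # `read` API shown earlier in this guide (the exact call is an assumption).
+ df = ds.read()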
@@ -147,6 +155,7 @@ Data Sources are also used while writing training data to external sources.
While calling the [Feature View](../../../concepts/fs/feature_view/fv_overview.md) API `create_training_data`, we can pass the `data_source` argument which is necessary to materialise the data to external sources, as shown below.
=== "PySpark"
+
```python
# materialise a training dataset
version, job = feature_view.create_training_data(
diff --git a/docs/user_guides/fs/feature_group/data_types.md b/docs/user_guides/fs/feature_group/data_types.md
index d5b42395c..2f8a1bfd3 100644
--- a/docs/user_guides/fs/feature_group/data_types.md
+++ b/docs/user_guides/fs/feature_group/data_types.md
@@ -166,10 +166,13 @@ The byte size of each column is determined by its data type and calculated as fo
For online-enabled feature groups, the dataframe to be ingested needs to adhere to the online schema definitions.
The input dataframe is validated for schema checks accordingly.
The validation is enabled by default and can be disabled by setting the keyword argument below when calling `insert()`:
+
=== "Python"
+
```python
feature_group.insert(df, validation_options={'online_schema_validation':False})
```
+
The most important validation checks or error messages are mentioned below along with possible corrective actions.
01. Primary key contains null values
@@ -179,6 +182,7 @@ The most important validation checks or error messages are mentioned below along
Alternatively, find the null values and assign them a unique value as per the preferred data imputation strategy.
=== "Pandas"
+
```python
# Drop rows: assuming 'id' is the primary key column
df = df.dropna(subset=['id'])
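+
+ # Alternatively (illustrative sketch): impute missing primary keys with
+ # fresh unique values instead of dropping the rows.
+ next_id = int(df['id'].max()) + 1
+ missing = df['id'].isna()
+ df.loc[missing, 'id'] = range(next_id, next_id + missing.sum())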
@@ -202,6 +206,7 @@ The most important validation checks or error messages are mentioned below along
- **Example correction** Add all the primary key columns in the dataframe.
=== "Pandas"
+
```python
# incrementing primary key up to the length of the dataframe
df['id'] = range(1, len(df) + 1)
@@ -216,6 +221,7 @@ The most important validation checks or error messages are mentioned below along
- Trim the string values to fit within maximum limit set during feature group creation.
=== "Pandas"
+
```python
max_length = 100
df['text_column'] = df['text_column'].str.slice(0, max_length)
@@ -223,12 +229,13 @@ The most important validation checks or error messages are mentioned below along
- Another option is to simply [create new version of the feature group][hsfs.feature_store.FeatureStore.get_or_create_feature_group] and insert the dataframe.
- !!!note
+ !!! note
The total row size limit should be less than 30kb as per [row size restrictions](#online-restrictions-for-row-size).
In such cases it is possible to define the feature as **TEXT** or **BLOB**.
Below is an example of explicitly defining the string column with TEXT as its online type.
=== "Pandas"
+
```python
import pandas as pd
# example dummy dataframe with the string column
@@ -279,6 +286,7 @@ If users explicitly define the schema for the feature group, Hopsworks is going
You can explicitly define the feature group schema as follows:
=== "Python"
+
```python
from hsfs.feature import Feature
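+
+ # A minimal sketch of an explicit schema; the feature names and types here
+ # are illustrative.
+ features = [
+     Feature(name="id", type="bigint", online_type="bigint"),
+     Feature(name="amount", type="double", online_type="double"),
+ ]
+
+ fg = fs.create_feature_group(
+     name="transactions",
+     version=1,
+     primary_key=["id"],
+     features=features,
+ )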
@@ -299,6 +307,7 @@ Hopsworks supports appending additional features to an existing feature group.
Adding additional features to an existing feature group is not considered a breaking change.
=== "Python"
+
```python
from hsfs.feature import Feature
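+
+ # A minimal sketch of appending a feature; assumes the `append_features`
+ # API and an illustrative feature name.
+ fg = fs.get_feature_group(name="transactions", version=1)
+ fg.append_features([Feature(name="discount", type="double")])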
diff --git a/docs/user_guides/fs/feature_group/on_demand_transformations.md b/docs/user_guides/fs/feature_group/on_demand_transformations.md
index 8cabf1370..57cdfac06 100644
--- a/docs/user_guides/fs/feature_group/on_demand_transformations.md
+++ b/docs/user_guides/fs/feature_group/on_demand_transformations.md
@@ -18,45 +18,47 @@ Alternatively, the name of the resulting on-demand feature can be explicitly def
Each on-demand transformation function can map specific features to its arguments by explicitly providing their names as arguments to the transformation function.
If no feature names are provided, the transformation function will default to using features that match the name of the transformation function's argument.
-=== "Python"
!!! example "Creating on-demand transformation functions."
- ```python
- # Define transformation function
- @hopsworks.udf(return_type=int, drop=["current_date"])
- def transaction_age(transaction_date, current_date):
- return (current_date - transaction_date).dt.days
-
- @hopsworks.udf(return_type=[str, str], drop=["current_date"])
- def stripped_strings(country, city):
- return country.strip(), city.strip()
-
- # Attach transformation function to feature group to create on-demand transformation function.
- fg = feature_store.create_feature_group(name="fg_transactions",
- version=1,
- description="Transaction Features",
- online_enabled=True,
- primary_key=['id'],
- event_time='event_time',
- transformation_functions=[transaction_age, stripped_strings]
- )
- ```
+ === "Python"
+
+ ```python
+ # Define transformation function
+ @hopsworks.udf(return_type=int, drop=["current_date"])
+ def transaction_age(transaction_date, current_date):
+ return (current_date - transaction_date).dt.days
+
+ @hopsworks.udf(return_type=[str, str], drop=["current_date"])
+ def stripped_strings(country, city):
+ return country.strip(), city.strip()
+
+ # Attach transformation function to feature group to create on-demand transformation function.
+ fg = feature_store.create_feature_group(name="fg_transactions",
+ version=1,
+ description="Transaction Features",
+ online_enabled=True,
+ primary_key=['id'],
+ event_time='event_time',
+ transformation_functions=[transaction_age, stripped_strings]
+ )
+ ```
### Specifying input features
The features to be used by the on-demand transformation function can be specified by providing the feature names as input to the transformation functions.
-=== "Python"
!!! example "Creating on-demand transformations by specifying features to be passed to transformation function."
- ```python
- fg = feature_store.create_feature_group(name="fg_transactions",
- version=1,
- description="Transaction Features",
- online_enabled=True,
- primary_key=['id'],
- event_time='event_time',
- transformation_functions=[age_transaction('transaction_time', 'current_time')]
- )
- ```
+ === "Python"
+
+ ```python
+ fg = feature_store.create_feature_group(name="fg_transactions",
+ version=1,
+ description="Transaction Features",
+ online_enabled=True,
+ primary_key=['id'],
+ event_time='event_time',
+                                         transformation_functions=[transaction_age('transaction_time', 'current_time')]
+ )
+ ```
## Usage
@@ -77,23 +79,23 @@ Inserting on-demand features as historical features saves time and computational
A feature view can include on-demand features from feature groups by selecting them in the [query](../feature_view/query.md) used to create the feature view.
These on-demand features are equivalent to regular features, and [model-dependent transformations](../feature_view/model-dependent-transformations.md) can be applied to them if required.
-=== "Python"
!!! example "Creating feature view with on-demand features"
- ```python
-
- # Selecting on-demand features in query
- query = fg.select(["id", "feature1", "feature2", "on_demand_feature3", "on_demand_feature4"])
-
- # Creating a feature view using a query that contains on-demand transformations and model-dependent transformations
- feature_view = fs.create_feature_view(
- name='transactions_view',
- query=query,
- transformation_functions=[
- min_max_scaler("feature1"),
- min_max_scaler("on_demand_feature3"),
- ]
- )
- ```
+ === "Python"
+
+ ```python
+ # Selecting on-demand features in query
+ query = fg.select(["id", "feature1", "feature2", "on_demand_feature3", "on_demand_feature4"])
+
+ # Creating a feature view using a query that contains on-demand transformations and model-dependent transformations
+ feature_view = fs.create_feature_view(
+ name='transactions_view',
+ query=query,
+ transformation_functions=[
+ min_max_scaler("feature1"),
+ min_max_scaler("on_demand_feature3"),
+ ]
+ )
+ ```
### Computing on-demand features
@@ -113,66 +115,69 @@ However, if the required input parameters are also not present in the feature ve
The `get_feature_vector` function retrieves a single feature vector based on the feature view's serving key(s).
The on-demand features in the feature vector can be computed using real-time data by passing a dictionary that associates the name of each input parameter needed for the on-demand transformation function with its respective new value to the `request_parameter` argument.
-=== "Python"
!!! example "Computing on-demand features while retrieving a feature vector"
- ```python
- feature_vector = feature_view.get_feature_vector(
- entry={"id": 1},
- request_parameter={
- "transaction_time": datetime(2022, 12, 28, 23, 55, 59),
- "current_time": datetime.now(),
- },
- )
- ```
+ === "Python"
+
+ ```python
+ feature_vector = feature_view.get_feature_vector(
+ entry={"id": 1},
+ request_parameter={
+ "transaction_time": datetime(2022, 12, 28, 23, 55, 59),
+ "current_time": datetime.now(),
+ },
+ )
+ ```
#### Retrieving feature vectors
The `get_feature_vectors` function retrieves multiple feature vectors using a list of feature view serving keys.
In this case, `request_parameter` can be a list of dictionaries that specifies the input parameters for the computation of on-demand features for each serving key, or a dictionary if the on-demand transformations require the same parameters for all serving keys.
-=== "Python"
!!! example "Computing on-demand features while retrieving a feature vectors"
- ```python
- # Specify unique request parameters for each serving key.
- feature_vector = feature_view.get_feature_vectors(
- entry=[{"id": 1}, {"id": 2}],
- request_parameter=[
- {
+ === "Python"
+
+ ```python
+ # Specify unique request parameters for each serving key.
+ feature_vector = feature_view.get_feature_vectors(
+ entry=[{"id": 1}, {"id": 2}],
+ request_parameter=[
+ {
+ "transaction_time": datetime(2022, 12, 28, 23, 55, 59),
+ "current_time": datetime.now(),
+ },
+ {
+ "transaction_time": datetime(2022, 11, 20, 12, 50, 00),
+ "current_time": datetime.now(),
+ },
+ ],
+ )
+
+ # Specify common request parameters for all serving key.
+ feature_vector = feature_view.get_feature_vectors(
+ entry=[{"id": 1}, {"id": 2}],
+ request_parameter={
"transaction_time": datetime(2022, 12, 28, 23, 55, 59),
"current_time": datetime.now(),
},
- {
- "transaction_time": datetime(2022, 11, 20, 12, 50, 00),
- "current_time": datetime.now(),
- },
- ],
- )
-
- # Specify common request parameters for all serving key.
- feature_vector = feature_view.get_feature_vectors(
- entry=[{"id": 1}, {"id": 2}],
- request_parameter={
- "transaction_time": datetime(2022, 12, 28, 23, 55, 59),
- "current_time": datetime.now(),
- },
- )
- ```
+ )
+ ```
#### Retrieving feature vector without on-demand features
The `get_feature_vector` and `get_feature_vectors` methods can return untransformed feature vectors without on-demand features by disabling model-dependent transformations and excluding on-demand features.
To achieve this, set the parameters `transform` and `on_demand_features` to `False`.
-=== "Python"
!!! example "Returning untransformed feature vectors"
- ```python
- untransformed_feature_vector = feature_view.get_feature_vector(
- entry={"id": 1}, transform=False, on_demand_features=False
- )
- untransformed_feature_vectors = feature_view.get_feature_vectors(
- entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False
- )
- ```
+ === "Python"
+
+ ```python
+ untransformed_feature_vector = feature_view.get_feature_vector(
+ entry={"id": 1}, transform=False, on_demand_features=False
+ )
+ untransformed_feature_vectors = feature_view.get_feature_vectors(
+ entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False
+ )
+ ```
#### Compute all on-demand features
@@ -182,75 +187,75 @@ The `transform` function can be used to apply model-dependent transformations to
In this case, `request_parameter` can be a list of dictionaries that specifies the input parameters for the computation of on-demand features for each feature vector given as input to the function, or a dictionary if the on-demand transformations require the same parameters for all input feature vectors.
-=== "Python"
!!! example "Computing all on-demand features and manually applying model dependent transformations."
- ```python
- # Specify request parameters for each serving key.
- untransformed_feature_vector = feature_view.get_feature_vector(
- entry={"id": 1}, transform=False, on_demand_features=False
- )
-
- # re-compute and add on-demand features to the feature vector
- feature_vector_with_on_demand_features = fv.compute_on_demand_features(
- untransformed_feature_vector,
- request_parameter={
- "transaction_time": datetime(2022, 12, 28, 23, 55, 59),
- "current_time": datetime.now(),
- },
- )
-
- # Applying model dependent transformations
- encoded_feature_vector = fv.transform(feature_vector_with_on_demand_features)
-
- # Specify request parameters for each serving key.
- untransformed_feature_vectors = feature_view.get_feature_vectors(
- entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False
- )
-
- # re-compute and add on-demand features to the feature vectors - Specify unique request parameter for each feature vector
- feature_vectors_with_on_demand_features = fv.compute_on_demand_features(
- untransformed_feature_vectors,
- request_parameter=[
- {
+ === "Python"
+
+ ```python
+ # Specify request parameters for each serving key.
+ untransformed_feature_vector = feature_view.get_feature_vector(
+ entry={"id": 1}, transform=False, on_demand_features=False
+ )
+
+ # re-compute and add on-demand features to the feature vector
+ feature_vector_with_on_demand_features = fv.compute_on_demand_features(
+ untransformed_feature_vector,
+ request_parameter={
"transaction_time": datetime(2022, 12, 28, 23, 55, 59),
"current_time": datetime.now(),
},
- {
- "transaction_time": datetime(2022, 11, 20, 12, 50, 00),
- "current_time": datetime.now(),
- },
- ],
- )
+ )
- # re-compute and add on-demand feature to the feature vectors - Specify common request parameter for all feature vectors
- feature_vectors_with_on_demand_features = fv.compute_on_demand_features(
- untransformed_feature_vectors,
- request_parameter={
- "transaction_time": datetime(2022, 12, 28, 23, 55, 59),
- "current_time": datetime.now(),
- },
- )
+ # Applying model dependent transformations
+ encoded_feature_vector = fv.transform(feature_vector_with_on_demand_features)
- # Applying model dependent transformations
- encoded_feature_vector = fv.transform(feature_vectors_with_on_demand_features)
+ # Specify request parameters for each serving key.
+ untransformed_feature_vectors = feature_view.get_feature_vectors(
+ entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False
+ )
- ```
+ # re-compute and add on-demand features to the feature vectors - Specify unique request parameter for each feature vector
+ feature_vectors_with_on_demand_features = fv.compute_on_demand_features(
+ untransformed_feature_vectors,
+ request_parameter=[
+ {
+ "transaction_time": datetime(2022, 12, 28, 23, 55, 59),
+ "current_time": datetime.now(),
+ },
+ {
+ "transaction_time": datetime(2022, 11, 20, 12, 50, 00),
+ "current_time": datetime.now(),
+ },
+ ],
+ )
+
+ # re-compute and add on-demand feature to the feature vectors - Specify common request parameter for all feature vectors
+ feature_vectors_with_on_demand_features = fv.compute_on_demand_features(
+ untransformed_feature_vectors,
+ request_parameter={
+ "transaction_time": datetime(2022, 12, 28, 23, 55, 59),
+ "current_time": datetime.now(),
+ },
+ )
+
+ # Applying model dependent transformations
+ encoded_feature_vector = fv.transform(feature_vectors_with_on_demand_features)
+ ```
#### Compute one on-demand feature
On-demand transformation functions can also be accessed and executed as normal functions by using the dictionary `on_demand_transformations` that maps the on-demand features to their corresponding on-demand transformation function.
-=== "Python"
!!! example "Executing each on-demand transformation function"
- ```python
- # Specify request parameters for each serving key.
- feature_vector = feature_view.get_feature_vector(
- entry={"id": 1}, transform=False, on_demand_features=False, return_type="pandas"
- )
-
- # Applying model dependent transformations
- feature_vector["on_demand_feature1"] = fv.on_demand_transformations[
- "on_demand_feature1"
- ](feature_vector["transaction_time"], datetime.now())
-
- ```
+ === "Python"
+
+ ```python
+ # Specify request parameters for each serving key.
+ feature_vector = feature_view.get_feature_vector(
+ entry={"id": 1}, transform=False, on_demand_features=False, return_type="pandas"
+ )
+
+ # Applying model dependent transformations
+ feature_vector["on_demand_feature1"] = fv.on_demand_transformations[
+ "on_demand_feature1"
+ ](feature_vector["transaction_time"], datetime.now())
+ ```
diff --git a/docs/user_guides/fs/feature_view/batch-data.md b/docs/user_guides/fs/feature_view/batch-data.md
index c41b9611a..8c57cfa53 100644
--- a/docs/user_guides/fs/feature_view/batch-data.md
+++ b/docs/user_guides/fs/feature_view/batch-data.md
@@ -7,6 +7,7 @@ Feature views support batch prediction by returning batch data as a DataFrame ov
The resultant DataFrame (or batch-scoring DataFrame) can then be fed to models to make predictions.
=== "Python"
+
```python
# get batch data
df = feature_view.get_batch_data(
@@ -14,7 +15,9 @@ The resultant DataFrame (or batch-scoring DataFrame) can then be fed to models t
end_time = "20220627"
) # return a dataframe
```
+
=== "Java"
+
```java
Dataset ds = featureView.getBatchData("20220620", "20220627")
```
@@ -27,6 +30,7 @@ Primary key(s) and event time are not usually included in the feature view query
To retrieve the primary key(s) and/or event time when retrieving batch data for inference, you need to set the parameters `primary_key=True` and/or `event_time=True`.
=== "Python"
+
```python
# get batch data
df = feature_view.get_batch_data(
@@ -71,19 +75,21 @@ It is important to note that in addition to the filters defined in feature view,
By default, the `get_batch_data` function returns batch data with model-dependent transformations applied.
However, you can retrieve untransformed batch data—while still including on-demand features—by setting the `transform` parameter to `False`.
-=== "Python"
!!! example "Returning untransformed batch data"
- ```python
- # Fetching untransformed batch data.
- untransformed_batch_data = feature_view.get_batch_data(transform=False)
- ```
+ === "Python"
+
+ ```python
+ # Fetching untransformed batch data.
+ untransformed_batch_data = feature_view.get_batch_data(transform=False)
+ ```
## Passing Context Variables to Transformation Functions
After [defining a transformation function using a context variable](../transformation_functions.md#passing-context-variables-to-transformation-function), you can pass the necessary context variables through the `transformation_context` parameter when fetching batch data.
-=== "Python"
- !!! example "Passing context variables while fetching batch data."
+!!! example "Passing context variables while fetching batch data."
+ === "Python"
+
```python
# Passing context variables when fetching batch data.
batch_data = feature_view.get_batch_data(transformation_context={"context_parameter":10})
diff --git a/docs/user_guides/fs/feature_view/feature-vectors.md b/docs/user_guides/fs/feature_view/feature-vectors.md
index 5a712a61e..a25bbb958 100644
--- a/docs/user_guides/fs/feature_view/feature-vectors.md
+++ b/docs/user_guides/fs/feature_view/feature-vectors.md
@@ -20,6 +20,7 @@ Alternatively, you can provide the primary key of the feature groups as the key of
It is also possible to provide a subset of the entry, which will be discussed [below](#partial-feature-retrieval).
=== "Python"
+
```python
# get a single vector
feature_view.get_feature_vector(
@@ -35,7 +36,9 @@ It is also possible to provide a subset of the entry, which will be discussed [b
]
)
```
+
=== "Java"
+
```java
// get a single vector
Map entry1 = Maps.newHashMap();
@@ -87,14 +90,18 @@ In the example, it is 1 because `right_fg` is in the first join in the query `le
It can happen that some of the primary key entries are not available in some or all of the feature groups used by a feature view.
Take the above example, assuming the feature view consists of two joined feature groups: the first with primary key column `pk1`, the second with primary key column `pk2`.
+
=== "Python"
+
```python
# get a single vector
feature_view.get_feature_vector(
entry = {"pk1": 1, "pk2": 2}
)
```
+
=== "Java"
+
```java
// get a single vector
Map entry1 = Maps.newHashMap();
@@ -102,10 +109,13 @@ Take the above example assuming the feature view consists of two joined feature
entry1.put("pk2", 2);
featureView.getFeatureVector(entry1);
```
+
This call will raise an exception if `pk1 = 1` OR `pk2 = 2` can't be found, but also if `pk1 = 1` AND `pk2 = 2` can't be found: it will never return a partial or empty feature vector.
When retrieving a batch of vectors, the behaviour is slightly different.
+
=== "Python"
+
```python
# get multiple vectors
feature_view.get_feature_vectors(
@@ -116,7 +126,9 @@ When retrieving a batch of vectors, the behaviour is slightly different.
]
)
```
+
=== "Java"
+
```java
// get multiple vectors
Map entry2 = Maps.newHashMap();
@@ -127,6 +139,7 @@ When retrieving a batch of vectors, the behaviour is slightly different.
entry3.put("pk2", 6);
featureView.getFeatureVectors(Lists.newArrayList(entry1, entry2, entry3));
```
+
This call will raise an exception if, for example, for the third entry `pk1 = 5` OR `pk2 = 6` can't be found; however, it will simply not return a vector for this entry if `pk1 = 5` AND `pk2 = 6`
can't be found.
That means `get_feature_vectors` will never return a partial feature vector, but will omit empty feature vectors.
@@ -140,6 +153,7 @@ In the example below, let's say you join 2 feature groups by `fg1.join(fg2, left
If `pk2` is not provided, this returns feature values from the first feature group and null values from the second feature group when using the option `allow_missing=True`, otherwise it raises an exception.
=== "Python"
+
```python
# get a single vector with
feature_view.get_feature_vector(
@@ -165,6 +179,7 @@ Then you can follow the above examples and retrieve the feature vectors.
Please note that transformed feature vectors can only be returned by the Python client, not the Java client.
=== "Python"
+
```python
feature_view.init_serving(training_dataset_version=1)
```
@@ -180,6 +195,7 @@ The feature view will apply the necessary transformations to the passed features
Please note that passed features are only available in the Python client, not the Java client.
=== "Python"
+
```python
# get a single vector
feature_view.get_feature_vector(
@@ -206,6 +222,7 @@ You can also use the parameter to provide values for all the features which are
In this second case, you do not have to provide the primary key value for that feature group as no data needs to be retrieved from the online feature store.
=== "Python"
+
```python
# get a single vector, replace values from an entire feature group
# note how in this example you don't have to provide the value of
@@ -228,42 +245,45 @@ By default, the `get_feature_vector` and `get_feature_vectors` functions return
However, you can retrieve the untransformed feature vectors, without applying model-dependent transformations but still including on-demand features, by setting the `transform` parameter to `False`.
-=== "Python"
!!! example "Returning untransformed feature vectors"
- ```python
- # Fetching untransformed feature vector.
- untransformed_feature_vector = feature_view.get_feature_vector(
- entry={"id": 1}, transform=False
- )
+ === "Python"
- # Fetching untransformed feature vectors.
- untransformed_feature_vectors = feature_view.get_feature_vectors(
- entry=[{"id": 1}, {"id": 2}], transform=False
- )
- ```
+ ```python
+ # Fetching untransformed feature vector.
+ untransformed_feature_vector = feature_view.get_feature_vector(
+ entry={"id": 1}, transform=False
+ )
+
+ # Fetching untransformed feature vectors.
+ untransformed_feature_vectors = feature_view.get_feature_vectors(
+ entry=[{"id": 1}, {"id": 2}], transform=False
+ )
+ ```
## Retrieving feature vector without on-demand features
The `get_feature_vector` and `get_feature_vectors` methods can also return untransformed feature vectors without on-demand features by disabling model-dependent transformations and excluding on-demand features.
To achieve this, set the parameters `transform` and `on_demand_features` to `False`.
-=== "Python"
!!! example "Returning untransformed feature vectors"
- ```python
- untransformed_feature_vector = feature_view.get_feature_vector(
- entry={"id": 1}, transform=False, on_demand_features=False
- )
- untransformed_feature_vectors = feature_view.get_feature_vectors(
- entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False
- )
- ```
+ === "Python"
+
+ ```python
+ untransformed_feature_vector = feature_view.get_feature_vector(
+ entry={"id": 1}, transform=False, on_demand_features=False
+ )
+ untransformed_feature_vectors = feature_view.get_feature_vectors(
+ entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False
+ )
+ ```
## Passing Context Variables to Transformation Functions
After [defining a transformation function using a context variable](../transformation_functions.md#passing-context-variables-to-transformation-function), you can pass the required context variables using the `transformation_context` parameter when fetching the feature vectors.
-=== "Python"
- !!! example "Passing context variables while fetching batch data."
+!!! example "Passing context variables while fetching batch data."
+ === "Python"
+
```python
# Passing context variables when fetching feature vectors.
feature_vectors = feature_view.get_feature_vectors(
diff --git a/docs/user_guides/fs/feature_view/feature_monitoring.md b/docs/user_guides/fs/feature_view/feature_monitoring.md
index 16c9362b3..674d228f2 100644
--- a/docs/user_guides/fs/feature_view/feature_monitoring.md
+++ b/docs/user_guides/fs/feature_view/feature_monitoring.md
@@ -39,6 +39,7 @@ In order to set up feature monitoring for a Feature View, you will need:
Connect the client running your notebooks to Hopsworks.
=== "Python"
+
```python3
import hopsworks
@@ -97,6 +98,7 @@ The following is a code example for creating a training dataset with two splits
You can set up statistics monitoring on a ==single feature or multiple features== of your Feature Group data, included in your Feature View query.
=== "Python"
+
```python3
# compute statistics for all the features
fg_monitoring_config = trans_fv.create_statistics_monitoring(
@@ -118,6 +120,7 @@ When enabling the comparison of statistics in a feature monitoring configuration
You can create multiple feature monitoring configurations on the same Feature View, but each of them should point to a single feature in the Feature View query.
=== "Python"
+
```python3
fg_monitoring_config = trans_fv.create_feature_monitoring(
name="trans_fv_amount_monitoring",
@@ -132,6 +135,7 @@ By default, the computation of statistics is scheduled to run endlessly, every d
You can modify the default schedule by adjusting the `cron_expression`, `start_date_time` and `end_date_time` parameters.
=== "Python"
+
```python3
fg_monitoring_config = trans_fv.create_statistics_monitoring(
name="trans_fv_all_features_monitoring",
@@ -157,6 +161,7 @@ You can define a different detection window using the `window_length` and `time_
Additionally, you can specify the percentage of feature data on which statistics will be computed using the `row_percentage` parameter.
=== "Python"
+
```python3
fm_monitoring_config.with_detection_window(
window_length="1w", # data ingested during one week
@@ -170,6 +175,7 @@ Additionally, you can specify the percentage of feature data on which statistics
When setting up feature monitoring for a Feature View, reference windows can be either a regular window, a specific value (i.e., window of size 1) or a training dataset.
=== "Python"
+
```python3
# compare statistics against a reference window
fm_monitoring_config.with_reference_window(
@@ -196,6 +202,7 @@ First, you select the metric to consider in the comparison using the `metric` pa
Then, you can define a relative or absolute threshold using the `threshold` and `relative` parameters.
=== "Python"
+
```python3
fm_monitoring_config.compare_on(
metric="mean",
@@ -214,6 +221,7 @@ Finally, you can save your feature monitoring configuration by calling the `save
Once the configuration is saved, the schedule for the statistics computation and comparison will be activated automatically.
=== "Python"
+
```python3
fm_monitoring_config.save()
```
diff --git a/docs/user_guides/fs/feature_view/helper-columns.md b/docs/user_guides/fs/feature_view/helper-columns.md
index 450ea0cdc..8e7497cba 100644
--- a/docs/user_guides/fs/feature_view/helper-columns.md
+++ b/docs/user_guides/fs/feature_view/helper-columns.md
@@ -22,9 +22,9 @@ In this use case `expiry_date` is an inference helper column.
It is not used for training but is necessary
for computing the [on-demand feature](../../../concepts/fs/feature_group/on_demand_feature.md) `days_valid`.
-=== "Python"
+!!! example "Define inference columns for feature views."
+ === "Python"
- !!! example "Define inference columns for feature views."
```python
# define query object
query = label_fg.select("fraud_label")\
@@ -48,9 +48,9 @@ However, they can be optionally fetched with inference or training data.
#### Batch inference
-=== "Python"
+!!! example "Fetch inference helper column values and compute on-demand features during batch inference."
+ === "Python"
- !!! example "Fetch inference helper column values and compute on-demand features during batch inference."
```python
# import feature functions
@@ -74,11 +74,10 @@ However, they can be optionally fetched with inference or training data.
#### Online inference
-=== "Python"
+!!! example "Fetch inference helper column values and compute on-demand features during online inference."
+ === "Python"
- !!! example "Fetch inference helper column values and compute on-demand features during online inference."
```python
-
from feature_functions import time_delta
# Fetch feature view object
@@ -104,9 +103,10 @@ However, they can be optionally fetched with inference or training data.
days_valid = time_delta(transaction_date, inference_helper['expiry_date'])
# Now get assembled feature vector for prediction
- feature_vector = feature_view.get_feature_vector({"cc_num": cc_num},
- passed_features={"days_valid": days_valid}
- )
+ feature_vector = feature_view.get_feature_vector(
+ {"cc_num": cc_num},
+ passed_features={"days_valid": days_valid},
+ )
```
## Training Helper columns
@@ -114,9 +114,9 @@ However, they can be optionally fetched with inference or training data.
`training_helper_columns` are a list of feature names that are not part of the model schema itself but are used during training for extra information.
For example, one might want to use a feature like the `category` of the purchased product to assign different weights.
-=== "Python"
+!!! example "Define training helper columns for feature views."
+ === "Python"
- !!! example "Define training helper columns for feature views."
```python
# define query object
query = label_fg.select("fraud_label")\
@@ -138,11 +138,10 @@ For example one might want to use feature like `category` of the purchased produ
When retrieving training data, helper columns will be omitted.
However, they can be optionally fetched.
-=== "Python"
+!!! example "Fetch training data with or without inference helper column values."
+ === "Python"
- !!! example "Fetch training data with or without inference helper column values."
```python
-
# import feature functions
from feature_functions import location_delta, time_delta
@@ -157,13 +156,13 @@ However, they can be optionally fetched.
X_train, X_test, y_train, y_test = feature_view.train_test_split(
description='transactions fraud training dataset',
test_size=TEST_SIZE,
- training_helper_columns=True
+ training_helper_columns=True
)
# Get existing training data with training helper columns
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
- training_dataset_version=1,
- training_helper_columns=True
+ training_dataset_version=1,
+ training_helper_columns=True
)
```
diff --git a/docs/user_guides/fs/feature_view/model-dependent-transformations.md b/docs/user_guides/fs/feature_view/model-dependent-transformations.md
index b16972b27..8e911dd47 100644
--- a/docs/user_guides/fs/feature_view/model-dependent-transformations.md
+++ b/docs/user_guides/fs/feature_view/model-dependent-transformations.md
@@ -25,9 +25,9 @@ For instance, for the function named `add_one_multiple` that outputs multiple c
The function named `add_two`, which outputs a single column in the example given below, produces a single output column named `add_two_feature`.
Additionally, Hopsworks allows users to specify custom names for transformed features using the [`alias`](../transformation_functions.md#specifying-output-features-names-for-transformation-functions) function.
-=== "Python"
+!!! example "Creating model-dependent transformation functions"
+ === "Python"
- !!! example "Creating model-dependent transformation functions"
```python
# Defining a many to many transformation function.
@udf(return_type=[int, int, int], drop=["feature1", "feature3"])
@@ -55,9 +55,9 @@ Additionally, Hopsworks also allows users to specify custom names for transforme
The features to be used by a model-dependent transformation function can be specified by providing the feature names (from the feature view / feature group) as input to the transformation functions.
-=== "Python"
+!!! example "Specifying input features to be passed to a model-dependent transformation function"
+ === "Python"
- !!! example "Specifying input features to be passed to a model-dependent transformation function"
```python
feature_view = fs.create_feature_view(
name='transactions_view',
@@ -76,9 +76,9 @@ The features to be used by a model-dependent transformation function can be spec
Built-in transformation functions are attached in the same way.
The only difference is that they can either be retrieved from Hopsworks or imported from the `hopsworks` module.
-=== "Python"
+!!! example "Creating model-dependent transformation using built-in transformation functions retrieved from Hopsworks"
+ === "Python"
- !!! example "Creating model-dependent transformation using built-in transformation functions retrieved from Hopsworks"
```python
min_max_scaler = fs.get_transformation_function(name="min_max_scaler")
standard_scaler = fs.get_transformation_function(name="standard_scaler")
@@ -100,9 +100,9 @@ The only difference is that they can either be retrieved from Hopsworks or i
To attach built-in transformation functions from the `hopsworks` module they can be directly imported into the code from `hopsworks.builtin_transformations`.
-=== "Python"
+!!! example "Creating model-dependent transformation using built-in transformation functions imported from hopsworks"
+ === "Python"
- !!! example "Creating model-dependent transformation using built-in transformation functions imported from hopsworks"
```python
from hopsworks.hsfs.builtin_transformations import min_max_scaler, label_encoder, robust_scaler, standard_scaler
@@ -127,9 +127,9 @@ The transformed features are organized by their output column names in alphabeti
Model-dependent transformation functions can also be manually applied to a feature vector using the `transform` function.
-=== "Python"
+!!! example "Manually applying model-dependent transformations during online inference"
+ === "Python"
- !!! example "Manually applying model-dependent transformations during online inference"
```python
# Initialize the feature view with the correct training dataset version used for model-dependent transformations
fv.init_serving(training_dataset_version)
@@ -146,21 +146,22 @@ Model-dependent transformation functions can also be manually applied to a featu
The `get_feature_vector`, `get_feature_vectors`, and `get_batch_data` methods can return untransformed feature vectors and batch data without applying model-dependent transformations while still including on-demand features.
To achieve this, set the `transform` parameter to `False`.
-=== "Python"
!!! example "Returning untransformed feature vectors and batch data."
- ```python
- # Fetching untransformed feature vector.
- untransformed_feature_vector = feature_view.get_feature_vector(
- entry={"id": 1}, transform=False
- )
-
- # Fetching untransformed feature vectors.
- untransformed_feature_vectors = feature_view.get_feature_vectors(
- entry=[{"id": 1}, {"id": 2}], transform=False
- )
-
- # Fetching untransformed batch data.
- untransformed_batch_data = feature_view.get_batch_data(
- transform=False
- )
- ```
+ === "Python"
+
+ ```python
+ # Fetching untransformed feature vector.
+ untransformed_feature_vector = feature_view.get_feature_vector(
+ entry={"id": 1}, transform=False
+ )
+
+ # Fetching untransformed feature vectors.
+ untransformed_feature_vectors = feature_view.get_feature_vectors(
+ entry=[{"id": 1}, {"id": 2}], transform=False
+ )
+
+ # Fetching untransformed batch data.
+ untransformed_batch_data = feature_view.get_batch_data(
+ transform=False
+ )
+ ```
diff --git a/docs/user_guides/fs/feature_view/overview.md b/docs/user_guides/fs/feature_view/overview.md
index aa66d8f72..0521b587f 100644
--- a/docs/user_guides/fs/feature_view/overview.md
+++ b/docs/user_guides/fs/feature_view/overview.md
@@ -61,10 +61,13 @@ To see a full example of how to create a feature view, you can read [this notebo
Once you have created a feature view, you can retrieve it by its name and version.
=== "Python"
+
```python
feature_view = fs.get_feature_view(name="transactions_view", version=1)
```
+
=== "Java"
+
```java
FeatureView featureView = featureStore.getFeatureView("transactions_view", 1)
```
@@ -75,10 +78,13 @@ If there are some feature view instances which you do not use anymore, you can d
It is important to mention that all training datasets (including all materialised HopsFS training data) will be deleted along with the feature view.
=== "Python"
+
```python
feature_view.delete()
```
+
=== "Java"
+
```java
featureView.delete()
```
@@ -90,6 +96,7 @@ You can attach, get, and remove tags.
You can learn more in [Tags Guide](../tags/tags.md).
=== "Python"
+
```python
# attach
feature_view.add_tag(name="tag_schema", value={"key": "value"})
@@ -100,7 +107,9 @@ You can learn more in [Tags Guide](../tags/tags.md).
#remove
feature_view.delete_tag(name="tag_schema")
```
+
=== "Java"
+
```java
// attach
Map tag = Maps.newHashMap();
diff --git a/docs/user_guides/fs/feature_view/query.md b/docs/user_guides/fs/feature_view/query.md
index 5683b4a75..6a2955819 100644
--- a/docs/user_guides/fs/feature_view/query.md
+++ b/docs/user_guides/fs/feature_view/query.md
@@ -11,6 +11,7 @@ The joining functionality is heavily inspired by the APIs used by Pandas to merg
The APIs allow you to specify which features to select from which feature group, how to join them and which features to use in join conditions.
=== "Python"
+
```python
fs = ...
credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions", version=1)
@@ -36,8 +37,8 @@ The APIs allow you to specify which features to select from which feature group,
```
=== "Scala"
- ```scala
+ ```scala
val fs = ...
val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions", 1)
val accountDetailsFg = fs.getFeatureGroup(name="account_details", version=1)
@@ -71,6 +72,7 @@ Most operations performed on `FeatureGroup` metadata objects will return a `Quer
Selecting features from a feature group is a lazy operation, returning a query with the selected features only:
=== "Python"
+
```python
credit_card_transactions_fg = fs.get_feature_group("credit_card_transactions")
@@ -79,7 +81,8 @@ Selecting features from a feature group is a lazy operation, returning a query w
```
=== "Scala"
- ```Scala
+
+ ```scala
val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions")
# Returns Query
@@ -93,13 +96,15 @@ The simplest join is one where we join all of the features together from two dif
By default, Hopsworks will use the maximal matching subset of the primary keys of the two feature groups as joining key(s), if not specified otherwise.
=== "Python"
+
```python
# Returns Query
selected_features = credit_card_transactions_fg.join(account_details_fg)
```
=== "Scala"
- ```Scala
+
+ ```scala
// Returns Query
val selectedFeatures = creditCardTransactionsFg.join(accountDetailsFg)
```
@@ -111,6 +116,7 @@ features for the join key of the left and right feature group.
The join key lists should contain the names of the features to join on.
=== "Python"
+
```python
selected_features = credit_card_transactions_fg.select_all() \
.join(account_details_fg.select_all(), on=["cc_num"]) \
@@ -118,6 +124,7 @@ The join key lists should contain the names of the features to join on.
```
=== "Scala"
+
```scala
val selectedFeatures = (creditCardTransactionsFg.selectAll()
.join(accountDetailsFg.selectAll(), Seq("cc_num"))
@@ -146,6 +153,7 @@ foreign keys for its child feature groups.
=== "Python"
+
```python
selected_features = credit_card_transactions.select_all()
.join(aggregated_cc_transactions.select_all())
@@ -158,6 +166,7 @@ In online inference, when you want to retrieve features in your online model, yo
known as the serving_keys, from the parent feature group to retrieve your precomputed feature values using the feature view.
=== "Python"
+
```python
feature_vector = feature_view.get_feature_vector({
    "cc_num": "1234 5555 3333 8888",
@@ -180,6 +189,7 @@ This is called Snowflake Schema data model where you need to build nested table
=== "Python"
+
```python
nested_selection = aggregated_cc_transactions.select_all()
.join(account_details.select_all())
@@ -193,6 +203,7 @@ This is called Snowflake Schema data model where you need to build nested table
Now, you have the benefit that in online inference you only need to pass two serving key values (the foreign keys of the leftmost feature group) to retrieve the precomputed features:
=== "Python"
+
```python
feature_vector = feature_view.get_feature_vector({
    "cc_num": "1234 5555 3333 8888",
@@ -209,11 +220,13 @@ Bitwise operators `&` and `|` are used to construct conjunctions and disjunctions.
For the Scala part of the API, equivalent methods are available in the `Feature` and `Filter` classes.
=== "Python"
+
```python
filtered_credit_card_transactions = credit_card_transactions_fg.filter(credit_card_transactions_fg.category == "Grocery")
```
=== "Scala"
+
```scala
val filteredCreditCardTransactions = creditCardTransactionsFg.filter(creditCardTransactionsFg.getFeature("category").eq("Grocery"))
```
@@ -221,6 +234,7 @@ For the Scala part of the API, equivalent methods are available in the `Feature`
Filters are fully compatible with joins:
=== "Python"
+
```python
selected_features = credit_card_transactions_fg.select_all() \
.join(account_details_fg.select_all(), on=["cc_num"]) \
@@ -229,6 +243,7 @@ Filters are fully compatible with joins:
```
=== "Scala"
+
```scala
val selectedFeatures = (creditCardTransactionsFg.selectAll()
.join(accountDetailsFg.selectAll(), Seq("cc_num"))
@@ -239,6 +254,7 @@ Filters are fully compatible with joins:
The filters can be applied at any point of the query:
=== "Python"
+
```python
selected_features = credit_card_transactions_fg.select_all() \
.join(accountDetails_fg.select_all().filter(accountDetails_fg.avg_temp >= 22), on=["cc_num"]) \
@@ -247,6 +263,7 @@ The filters can be applied at any point of the query:
```
=== "Scala"
+
```scala
val selectedFeatures = (creditCardTransactionsFg.selectAll()
.join(accountDetailsFg.selectAll().filter(accountDetailsFg.getFeature("avg_temp").ge(22)), Seq("cc_num"))
@@ -261,6 +278,7 @@ However, this operation will not update the metadata and persist the updated que
This query can then be used to create a new feature view.
=== "Python"
+
```python
fs = ...
merchant_details_fg = fs.get_feature_group(name="merchant_details", version=1)
@@ -272,6 +290,7 @@ This query can then be used to create a new feature view.
```
=== "Scala"
+
```scala
val fs = ...
val merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1)
@@ -287,6 +306,7 @@ This query can then be used to create a new feature view.
To successfully apply new join/filter logic it is recommended to refresh the query instance by re-fetching the feature view:
=== "Python"
+
```python
fs = ...
@@ -311,6 +331,7 @@ This query can then be used to create a new feature view.
```
=== "Scala"
+
```scala
fs = ...
merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1)
diff --git a/docs/user_guides/fs/feature_view/training-data.md b/docs/user_guides/fs/feature_view/training-data.md
index 33c83faef..e8a0d46ad 100644
--- a/docs/user_guides/fs/feature_view/training-data.md
+++ b/docs/user_guides/fs/feature_view/training-data.md
@@ -121,21 +121,25 @@ Once you have [defined a transformation function using a context variable](../tr
!!! note
Passing context variables for materializing a training dataset is only supported in the PySpark Kernel.
-=== "Python"
- !!! example "Passing context variables while creating training data."
+!!! example "Passing context variables while creating training data."
+ === "Python"
+
```python
# Passing context variable to IN-MEMORY Training Dataset.
- X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1,
- primary_key=True,
- event_time=True,
- transformation_context={"context_parameter":10})
+ X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
+ training_dataset_version=1,
+ primary_key=True,
+ event_time=True,
+ transformation_context={"context_parameter":10},
+ )
# Passing context variable to Materialized Training Dataset.
- version, job = feature_view.get_train_test_split(training_dataset_version=1,
- primary_key=True,
- event_time=True,
- transformation_context={"context_parameter":10})
-
+ version, job = feature_view.get_train_test_split(
+ training_dataset_version=1,
+ primary_key=True,
+ event_time=True,
+ transformation_context={"context_parameter":10},
+ )
```
## Read training data with primary key(s) and event time
@@ -146,9 +150,11 @@ To retrieve the primary key(s) and/or event time when retrieving training data,
```python
# get a training dataset
-X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1,
- primary_key=True,
- event_time=True)
+X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
+ training_dataset_version=1,
+ primary_key=True,
+ event_time=True,
+)
```
!!! note
diff --git a/docs/user_guides/fs/transformation_functions.md b/docs/user_guides/fs/transformation_functions.md
index 5466e5c96..cefc6e020 100644
--- a/docs/user_guides/fs/transformation_functions.md
+++ b/docs/user_guides/fs/transformation_functions.md
@@ -73,9 +73,9 @@ Hopsworks supports four types of transformation functions across all execution m
To create a one-to-one transformation function, the Hopsworks `@udf` decorator must be provided with the `return_type` as a single Python type.
The transformation function should take one argument as input and return a Pandas Series.
-=== "Python"
+!!! example "Creation of a one-to-one transformation function in Hopsworks."
+ === "Python"
- !!! example "Creation of a one-to-one transformation function in Hopsworks."
```python
from hopsworks import udf
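+
+ # A minimal illustrative sketch of a one-to-one UDF: one input Series in,
+ # one transformed Series out.
+ @udf(return_type=int)
+ def add_one(feature):
+     return feature + 1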
@@ -88,8 +88,9 @@ The transformation function should take one argument as input and return a Panda
The creation of many-to-one transformation functions is similar to that of a one-to-one transformation function, the only difference being that the transformation function accepts multiple features as input.
-=== "Python"
- !!! example "Creation of a many-to-one transformation function in Hopsworks."
+!!! example "Creation of a many-to-one transformation function in Hopsworks."
+ === "Python"
+
```python
from hopsworks import udf
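+
+ # A minimal illustrative sketch of a many-to-one UDF: two input features
+ # combined into a single output.
+ @udf(return_type=float)
+ def ratio(feature1, feature2):
+     return feature1 / feature2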
@@ -103,8 +104,9 @@ The creation of many-to-one transformation functions is similar to that of a one
To create a one-to-many transformation function, the Hopsworks `@udf` decorator must be provided with the `return_type` as a list of Python types, and the transformation function should take one argument as input and return multiple features as a Pandas DataFrame.
The return types provided to the decorator must match the types of each column in the returned Pandas DataFrame.
-=== "Python"
- !!! example "Creation of a one-to-many transformation function in Hopsworks."
+!!! example "Creation of a one-to-many transformation function in Hopsworks."
+ === "Python"
+
```python
from hopsworks import udf
import pandas as pd
@@ -118,8 +120,9 @@ The return types provided to the decorator must match the types of each column i
The creation of a many-to-many transformation function is similar to that of a one-to-many transformation function, the only difference being that the transformation function accepts multiple features as input.
-=== "Python"
- !!! example "Creation of a many-to-many transformation function in Hopsworks."
+!!! example "Creation of a many-to-many transformation function in Hopsworks."
+ === "Python"
+
```python
from hopsworks import udf
import pandas as pd
@@ -134,14 +137,15 @@ The creation of a many-to-many transformation function is similar to that of a o
The `mode` parameter of the `@udf` decorator can be used to specify the execution mode of the transformation function.
It accepts three possible values: `default`, `python`, and `pandas`. Each mode is explained in more detail below:
-#### Default
+#### Default Mode
This execution mode assumes that the transformation function can be executed as either a Pandas UDF or a Python UDF.
It serves as the default mode used when the `mode` parameter is not specified.
In this mode, the transformation function is executed as a Pandas UDF during training and in the batch inference pipeline, while it operates as a Python UDF during online inference.
-=== "Python"
- !!! example "Creating a many to many transformations function using the default execution mode"
+!!! example "Creating a many to many transformations function using the default execution mode"
+ === "Python"
+
```python
from hopsworks import udf
import pandas as pd
@@ -156,12 +160,13 @@ In this mode, the transformation function is executed as a Pandas UDF during tra
return feature1 + 2, feature2 + 2, feature3 + 2
```
-#### Python
+#### Python Mode
The transformation function can be configured to always execute as a Python UDF by setting the `mode` parameter of the `@udf` decorator to `python`.
-=== "Python"
- !!! example "Creating a many to many transformation function as a Python UDF"
+!!! example "Creating a many to many transformation function as a Python UDF"
+ === "Python"
+
```python
from hopsworks import udf
import pandas as pd
@@ -171,12 +176,13 @@ The transformation function can be configured to always execute as a Python UDF
return feature1 + 1, feature2 + 1, feature3 + 1
```
-#### Pandas
+#### Pandas Mode
The transformation function can be configured to always execute as a Pandas UDF by setting the `mode` parameter of the `@udf` decorator to `pandas`.
-=== "Python"
- !!! example "Creating a many to many transformations function as a Pandas UDF"
+!!! example "Creating a many to many transformations function as a Pandas UDF"
+ === "Python"
+
```python
from hopsworks import udf
import pandas as pd
@@ -197,8 +203,9 @@ The transformation function can be configured to always execute as a Pandas UDF
The `drop` parameter of the `@udf` decorator is used to drop specific columns in the input DataFrame after transformation. If any argument of the transformation function is passed to the `drop` parameter, then the column mapped to the argument is dropped after the transformation functions are applied.
In the example below, the columns mapped to the arguments `feature1` and `feature3` are dropped after the application of all transformation functions.
-=== "Python"
- !!! example "Specify arguments to drop after transformation"
+!!! example "Specify arguments to drop after transformation"
+ === "Python"
+
```python
from hopsworks import udf
import pandas as pd
@@ -214,8 +221,9 @@ The [`TransformationFunction.alias`][hsfs.transformation_function.Transformation
Each name must be unique and should be at most 63 characters long.
If no name is provided via the `alias` function, Hopsworks generates default output feature names when [on-demand](./feature_group/on_demand_transformations.md) or [model-dependent](./feature_view/model-dependent-transformations.md) transformation functions are created.
-=== "Python"
- !!! example "Specifying output column names for transformation functions."
+!!! example "Specifying output column names for transformation functions."
+ === "Python"
+
```python
from hopsworks import udf
import pandas as pd
@@ -243,8 +251,9 @@ The `TransformationStatistics` instance contains separate objects with the sam
These objects encapsulate statistics related to the argument as instances of the class [`FeatureTransformationStatistics`][hsfs.transformation_statistics.FeatureTransformationStatistics].
Upon instantiation, instances of `FeatureTransformationStatistics` contain `None` values and are updated with the required statistics after the creation of a training dataset.
-=== "Python"
- !!! example "Creation of a transformation function in Hopsworks that uses training dataset statistics"
+!!! example "Creation of a transformation function in Hopsworks that uses training dataset statistics"
+ === "Python"
+
```python
from hopsworks import udf
from hopsworks.transformation_statistics import TransformationStatistics
@@ -262,8 +271,9 @@ The `context` keyword argument can be defined in a transformation function to ac
These variables contain common data used across transformation functions.
By including the context argument, you can pass the necessary data as a dictionary into the `context` argument of the transformation function during [training dataset creation](feature_view/training-data.md#passing-context-variables-to-transformation-functions), [feature vector retrieval](feature_view/feature-vectors.md#passing-context-variables-to-transformation-functions), or [batch data retrieval](feature_view/batch-data.md#passing-context-variables-to-transformation-functions).
-=== "Python"
- !!! example "Creation of a transformation function in Hopsworks that accepts context variables"
+!!! example "Creation of a transformation function in Hopsworks that accepts context variables"
+ === "Python"
+
```python
from hopsworks import udf
@@ -277,9 +287,9 @@ By including the context argument, you can pass the necessary data as a dictiona
To save a transformation function to the feature store, use the function `create_transformation_function`. It creates a [`TransformationFunction`][hsfs.transformation_function.TransformationFunction] object which can then be persisted by calling its `save` method.
The `save` method will throw an error if another transformation function with the same name and version is already saved in the feature store.
-=== "Python"
+!!! example "Register transformation function `add_one` in the Hopsworks feature store"
+ === "Python"
- !!! example "Register transformation function `add_one` in the Hopsworks feature store"
```python
plus_one_meta = fs.create_transformation_function(
transformation_function=add_one,
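
The hunk truncates the example here; a hedged completion based on the surrounding text (the `version` argument is an assumption):

```python
plus_one_meta = fs.create_transformation_function(
    transformation_function=add_one,
    version=1,
)

# Persist to the feature store; this raises an error if the same
# name and version already exist.
plus_one_meta.save()
```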
@@ -294,9 +304,9 @@ To retrieve all transformation functions from the feature store, use the functio
A specific transformation function can be retrieved using its `name` and `version` with the function `get_transformation_function`.
If only the `name` is provided, then the version will default to 1.
-=== "Python"
+!!! example "Retrieving transformation functions from the feature store"
+ === "Python"
- !!! example "Retrieving transformation functions from the feature store"
```python
# get all transformation functions
fs.get_transformation_functions()
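
A hedged sketch of the by-name retrieval described above (per the text, omitting `version` defaults to 1; the name is illustrative):

```python
# Retrieve a specific transformation function by name and version.
add_one_fn = fs.get_transformation_function(name="add_one", version=2)

# If only the name is provided, the version defaults to 1.
add_one_fn = fs.get_transformation_function(name="add_one")
```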
diff --git a/docs/user_guides/migration/40_migration.md b/docs/user_guides/migration/40_migration.md
index 4bd5ef6e7..2b589b99f 100644
--- a/docs/user_guides/migration/40_migration.md
+++ b/docs/user_guides/migration/40_migration.md
@@ -43,6 +43,7 @@ With 4.0, On-Demand Transformation Functions are now better supported which has
The following is how transformation functions were used in previous versions of Hopsworks and how they are used in the 4.0 release.
=== "Pre-4.0"
+
```python
#################################################
# Creating transformation function Hopsworks 3.8#
@@ -81,6 +82,7 @@ The following is how transformation functions were used in previous versions of
```
=== "4.0"
+
```python
#################################################
# Creating transformation function Hopsworks 4.0#
diff --git a/docs/user_guides/mlops/registry/frameworks/llm.md b/docs/user_guides/mlops/registry/frameworks/llm.md
index 9d7edf162..94c64862d 100644
--- a/docs/user_guides/mlops/registry/frameworks/llm.md
+++ b/docs/user_guides/mlops/registry/frameworks/llm.md
@@ -13,6 +13,7 @@ In this guide you will learn how to export a [Large Language Model (LLM)](https:
### Step 1: Connect to Hopsworks
=== "Python"
+
```python
import hopsworks
@@ -28,6 +29,7 @@ Download your base or fine-tuned LLM.
LLMs can typically be downloaded using the official frameworks provided by their creators (e.g., HuggingFace, Ollama, ...)
=== "Python"
+
```python
# Download LLM (e.g., using huggingface to download Llama-3.1-8B base model)
from huggingface_hub import snapshot_download
@@ -44,6 +46,7 @@ If necessary, fine-tune your LLM with an [instruction set](https://www.hopsworks
An LLM can be fine-tuned fully or using [Parameter Efficient Fine Tuning (PEFT)](https://www.hopsworks.ai/dictionary/parameter-efficient-fine-tuning-of-llms) methods such as LoRA or QLoRA.
=== "Python"
+
```python
# Fine-tune LLM using PEFT (LoRA, QLoRA) or other methods
model_dir = ...
@@ -55,6 +58,7 @@ Use the `ModelRegistry.llm.create_model(..)` function to register a model as LLM
Define a name, and attach optional metrics for your model, then invoke the `save()` function, passing the path to the local directory where the model was exported.
=== "Python"
+
```python
# Model evaluation metrics
metrics = {'f1-score': 0.8, 'perplexity': 31.62, 'bleu-score': 0.73}
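
The hunk truncates here; a hedged sketch of the registration step the paragraph describes, assuming `mr` is the project's model registry handle and `model_dir` is the export directory from the previous step (the model name is illustrative):

```python
# Register the LLM with its evaluation metrics and upload the export directory.
llm_model = mr.llm.create_model("llama31_8b_base", metrics=metrics)
llm_model.save(model_dir)
```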
diff --git a/docs/user_guides/mlops/registry/frameworks/python.md b/docs/user_guides/mlops/registry/frameworks/python.md
index a7e6ba89d..6b54454bf 100644
--- a/docs/user_guides/mlops/registry/frameworks/python.md
+++ b/docs/user_guides/mlops/registry/frameworks/python.md
@@ -13,6 +13,7 @@ In this guide you will learn how to export a generic Python model and register i
### Step 1: Connect to Hopsworks
=== "Python"
+
```python
import hopsworks
@@ -27,6 +28,7 @@ In this guide you will learn how to export a generic Python model and register i
Define your XGBoost model and run the training loop.
=== "Python"
+
```python
# Define a model
model = XGBClassifier()
@@ -40,6 +42,7 @@ Define your XGBoost model and run the training loop.
Export the XGBoost model to a directory on the local filesystem.
=== "Python"
+
```python
model_file = "model.json"
@@ -52,6 +55,7 @@ Use the `ModelRegistry.python.create_model(..)` function to register a model as
Define a name, and attach optional metrics for your model, then invoke the `save()` function, passing the path to the local directory where the model was exported.
=== "Python"
+
```python
# Model evaluation metrics
metrics = {'accuracy': 0.92}
diff --git a/docs/user_guides/mlops/registry/frameworks/skl.md b/docs/user_guides/mlops/registry/frameworks/skl.md
index 7969a3050..9edce3839 100644
--- a/docs/user_guides/mlops/registry/frameworks/skl.md
+++ b/docs/user_guides/mlops/registry/frameworks/skl.md
@@ -13,6 +13,7 @@ In this guide you will learn how to export a Scikit-learn model and register it
### Step 1: Connect to Hopsworks
=== "Python"
+
```python
import hopsworks
@@ -27,6 +28,7 @@ In this guide you will learn how to export a Scikit-learn model and register it
Define your Scikit-learn model and run the training loop.
=== "Python"
+
```python
# Define a model
iris_knn = KNeighborsClassifier(..)
@@ -39,6 +41,7 @@ Define your Scikit-learn model and run the training loop.
Export the Scikit-learn model to a directory on the local filesystem.
=== "Python"
+
```python
model_file = "skl_knn.pkl"
@@ -51,6 +54,7 @@ Use the `ModelRegistry.sklearn.create_model(..)` function to register a model as
Define a name, and attach optional metrics for your model, then invoke the `save()` function, passing the path to the local directory where the model was exported.
=== "Python"
+
```python
# Model evaluation metrics
metrics = {'accuracy': 0.92}
diff --git a/docs/user_guides/mlops/registry/frameworks/tch.md b/docs/user_guides/mlops/registry/frameworks/tch.md
index 7e0a2aa52..27551958b 100644
--- a/docs/user_guides/mlops/registry/frameworks/tch.md
+++ b/docs/user_guides/mlops/registry/frameworks/tch.md
@@ -13,6 +13,7 @@ In this guide you will learn how to export a Torch model and register it in the
### Step 1: Connect to Hopsworks
=== "Python"
+
```python
import hopsworks
@@ -27,6 +28,7 @@ In this guide you will learn how to export a Torch model and register it in the
Define your Torch model and run the training loop.
=== "Python"
+
```python
# Define the model architecture
class Net(nn.Module):
@@ -53,6 +55,7 @@ Define your Torch model and run the training loop.
Export the Torch model to a directory on the local filesystem.
=== "Python"
+
```python
model_dir = "./model"
@@ -65,6 +68,7 @@ Use the `ModelRegistry.torch.create_model(..)` function to register a model as a
Define a name, and attach optional metrics for your model, then invoke the `save()` function, passing the path to the local directory where the model was exported.
=== "Python"
+
```python
# Model evaluation metrics
metrics = {'accuracy': 0.92}
diff --git a/docs/user_guides/mlops/registry/frameworks/tf.md b/docs/user_guides/mlops/registry/frameworks/tf.md
index 5de153544..97070fda8 100644
--- a/docs/user_guides/mlops/registry/frameworks/tf.md
+++ b/docs/user_guides/mlops/registry/frameworks/tf.md
@@ -16,6 +16,7 @@ In this guide you will learn how to export a TensorFlow model and register it in
### Step 1: Connect to Hopsworks
=== "Python"
+
```python
import hopsworks
@@ -30,6 +31,7 @@ In this guide you will learn how to export a TensorFlow model and register it in
Define your TensorFlow model and run the training loop.
=== "Python"
+
```python
# Define a model
model = tf.keras.Sequential()
@@ -49,6 +51,7 @@ Define your TensorFlow model and run the training loop.
Export the TensorFlow model to a directory on the local filesystem.
=== "Python"
+
```python
model_dir = "./model"
@@ -61,6 +64,7 @@ Use the `ModelRegistry.tensorflow.create_model(..)` function to register a model
Define a name, and attach optional metrics for your model, then invoke the `save()` function, passing the path to the local directory where the model was exported.
=== "Python"
+
```python
# Model evaluation metrics
metrics = {'accuracy': 0.92}
diff --git a/docs/user_guides/mlops/registry/input_example.md b/docs/user_guides/mlops/registry/input_example.md
index 86908443f..cdfd5c2e7 100644
--- a/docs/user_guides/mlops/registry/input_example.md
+++ b/docs/user_guides/mlops/registry/input_example.md
@@ -15,6 +15,7 @@ Attaching an input example to your model will give other users a better understa
### Step 1: Connect to Hopsworks
=== "Python"
+
```python
import hopsworks
@@ -30,6 +31,7 @@ Generate an input example which corresponds to a valid input to your model.
Currently we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list` to be passed as an input example.
=== "Python"
+
```python
import numpy as np
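
The example is truncated by the hunk; a hedged sketch of a valid input example for an MNIST-style model (the shape is illustrative):

```python
# One illustrative sample shaped like the model's expected input.
input_example = np.random.rand(28, 28)
```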
@@ -41,6 +43,7 @@ Currently we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list` to b
Set the `input_example` parameter in the `create_model` function and call `save()` to attach it to the model and register it in the registry.
=== "Python"
+
```python
model = mr.tensorflow.create_model(name="mnist",
input_example=input_example)
diff --git a/docs/user_guides/mlops/registry/model_evaluation_images.md b/docs/user_guides/mlops/registry/model_evaluation_images.md
index cf166bb15..64d507c7c 100644
--- a/docs/user_guides/mlops/registry/model_evaluation_images.md
+++ b/docs/user_guides/mlops/registry/model_evaluation_images.md
@@ -16,6 +16,7 @@ By attaching model evaluation images to your versioned model, other users can be
### Step 1: Connect to Hopsworks
=== "Python"
+
```python
import hopsworks
@@ -30,6 +31,7 @@ By attaching model evaluation images to your versioned model, other users can be
Generate an image that visualizes model performance and evaluation metrics
=== "Python"
+
```python
import seaborn
from sklearn.metrics import confusion_matrix
@@ -63,6 +65,7 @@ Generate an image that visualizes model performance and evaluation metrics
Save the figure to a file with a common filename extension (for example, .png or .jpeg), and place it in a directory called `images`, a subdirectory of the model directory that is registered to Hopsworks.
=== "Python"
+
```python
# Specify the directory name for saving the model and related artifacts
model_dir = "./model"
diff --git a/docs/user_guides/mlops/registry/model_schema.md b/docs/user_guides/mlops/registry/model_schema.md
index e7c8fe5c5..98e6e98ac 100644
--- a/docs/user_guides/mlops/registry/model_schema.md
+++ b/docs/user_guides/mlops/registry/model_schema.md
@@ -15,6 +15,7 @@ Attaching a model schema to your model will give other users a better understand
### Step 1: Connect to Hopsworks
=== "Python"
+
```python
import hopsworks
@@ -30,6 +31,7 @@ Create a ModelSchema for your inputs and outputs by passing in an example that y
Currently, we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list`.
=== "Python"
+
```python
# Import a Schema and ModelSchema definition
from hsml.utils.model_schema import ModelSchema
@@ -56,6 +58,7 @@ Currently, we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list`.
Set the `model_schema` parameter in the `create_model` function and call `save()` to attach it to the model and register it in the registry.
=== "Python"
+
```python
model = mr.tensorflow.create_model(name="mnist",
model_schema=model_schema)
diff --git a/docs/user_guides/mlops/serving/predictor.md b/docs/user_guides/mlops/serving/predictor.md
index 5f77bce97..049bfe852 100644
--- a/docs/user_guides/mlops/serving/predictor.md
+++ b/docs/user_guides/mlops/serving/predictor.md
@@ -171,6 +171,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott
### Step 2 (Optional): Implement a predictor script
=== "Predictor"
+
``` python
class Predictor():
@@ -184,7 +185,9 @@ Once you are done with the changes, click on `Create new deployment` at the bott
# Use the model to make predictions
# return self.model.predict(inputs)
```
+
=== "Async Predictor"
+
``` python
class Predictor():
@@ -201,7 +204,9 @@ Once you are done with the changes, click on `Create new deployment` at the bott
# Use the model to make predictions
# return self.model.predict(result)
```
+
=== "Predictor (vLLM deployments only)"
+
``` python
import os
from vllm import __version__, AsyncEngineArgs, AsyncLLMEngine
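
The vLLM tab is truncated by the hunk; a minimal sketch of how such a predictor might initialize the engine, assuming vLLM's `AsyncEngineArgs`/`AsyncLLMEngine` API and an illustrative `MODEL_PATH` environment variable:

```python
class Predictor():

    def __init__(self):
        # Illustrative only: engine arguments depend on the deployed model.
        engine_args = AsyncEngineArgs(model=os.environ.get("MODEL_PATH", "."))
        self.engine = AsyncLLMEngine.from_engine_args(engine_args)
```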
diff --git a/docs/user_guides/mlops/serving/rest-api.md b/docs/user_guides/mlops/serving/rest-api.md
index 7d3df7456..d355bf4aa 100644
--- a/docs/user_guides/mlops/serving/rest-api.md
+++ b/docs/user_guides/mlops/serving/rest-api.md
@@ -48,9 +48,9 @@ The request must be sent as a JSON object containing an `inputs` or `instances`
See [more information on the request format](https://kserve.github.io/website/docs/concepts/architecture/data-plane/v1-protocol#request-format).
An example for this is given below.
-=== "Python"
+!!! example "REST API example for Predictive Inference (Tensorflow or SkLearn or Python Serving)"
+ === "Python"
- !!! example "REST API example for Predictive Inference (Tensorflow or SkLearn or Python Serving)"
```python
import requests
@@ -77,9 +77,8 @@ An example for this is given below.
print(response.json())
```
-=== "Curl"
+ === "Curl"
- !!! example "REST API example for Predictive Inference (Tensorflow or SkLearn or Python Serving)"
```bash
curl -X POST "http://10.87.42.108/v1/models/fraud:predict" \
-H "Host: fraud.test.hopsworks.ai" \
diff --git a/docs/user_guides/mlops/serving/transformer.md b/docs/user_guides/mlops/serving/transformer.md
index 607734b20..b2606206f 100644
--- a/docs/user_guides/mlops/serving/transformer.md
+++ b/docs/user_guides/mlops/serving/transformer.md
@@ -113,6 +113,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott
### Step 2: Implement transformer script
=== "Transformer"
+
```python
class Transformer():
def __init__(self):
diff --git a/docs/user_guides/projects/opensearch/knn.md b/docs/user_guides/projects/opensearch/knn.md
index a1c91db57..442eeb758 100644
--- a/docs/user_guides/projects/opensearch/knn.md
+++ b/docs/user_guides/projects/opensearch/knn.md
@@ -17,6 +17,7 @@ In this guide, you will learn how to create a simple recommendation application,
### Step 1: Get the OpenSearch API
=== "Python"
+
```python
import hopsworks
@@ -28,6 +29,7 @@ In this guide, you will learn how to create a simple recommendation application,
### Step 2: Configure the opensearch-py client
=== "Python"
+
```python
from opensearchpy import OpenSearch
@@ -39,6 +41,7 @@ In this guide, you will learn how to create a simple recommendation application,
Create an index to use by calling `opensearch_api.get_project_index(..)`.
=== "Python"
+
```python
knn_index_name = opensearch_api.get_project_index("demo_knn_index")
@@ -68,6 +71,7 @@ Ingest 10 vectors in a bulk fashion to the index.
These are the candidate vectors against which similarity will be computed.
=== "Python"
+
```python
from opensearchpy.helpers import bulk
import random
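
The ingestion code is truncated here; a hedged sketch of the bulk ingestion described above, using `opensearchpy.helpers.bulk` (the vector field name and the `client`/`knn_index_name` handles come from earlier steps or are illustrative):

```python
# Build 10 illustrative 2-d vectors and ingest them in one bulk request.
actions = [
    {
        "_index": knn_index_name,
        "_id": i,
        "my_vector": [random.uniform(0, 10), random.uniform(0, 10)],
    }
    for i in range(10)
]
bulk(client, actions)
```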
@@ -94,6 +98,7 @@ These vectors represent the list of vectors to calculate the similarity for.
Score the vector `[2.5, 3]` and find the 3 most similar vectors.
=== "Python"
+
```python
# Define the search request
query = {
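
The query is cut off by the hunk; a hedged completion using the standard OpenSearch k-NN query DSL (the vector field name is illustrative):

```python
# Find the 3 vectors most similar to [2.5, 3].
query = {
    "size": 3,
    "query": {
        "knn": {
            "my_vector": {
                "vector": [2.5, 3],
                "k": 3,
            }
        }
    },
}
response = client.search(body=query, index=knn_index_name)
```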
From da51f430d7967d77e4adaecf3dc64f21b0d02754 Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Wed, 25 Feb 2026 13:03:02 +0100
Subject: [PATCH 08/16] Improve examples formatting
---
docs/setup_installation/admin/roleChaining.md | 2 -
.../azure/getting_started.md | 1 -
docs/user_guides/fs/data_source/usage.md | 28 ++++---
docs/user_guides/fs/feature_group/create.md | 68 ++++++++--------
.../fs/feature_group/create_external.md | 31 ++++----
.../fs/feature_group/create_spine.md | 6 +-
.../fs/feature_group/data_types.md | 52 ++++++------
.../fs/feature_group/data_validation.md | 53 ++++++-------
.../feature_group/data_validation_advanced.md | 36 ++++-----
.../data_validation_best_practices.md | 37 +++++----
.../fs/feature_group/deprecation.md | 4 +-
.../fs/feature_group/feature_monitoring.md | 28 +++----
.../fs/feature_group/notification.md | 14 ++--
.../on_demand_transformations.md | 61 ++++++++------
.../online_ingestion_observability.md | 2 +-
.../fs/feature_group/statistics.md | 17 ++--
docs/user_guides/fs/feature_group/ttl.md | 1 +
.../feature_monitoring_advanced.md | 10 +--
.../fs/feature_monitoring/index.md | 1 -
.../user_guides/fs/feature_view/batch-data.md | 23 +++---
.../fs/feature_view/feature-vectors.md | 68 ++++++----------
.../fs/feature_view/feature_logging.md | 24 +++---
.../fs/feature_view/feature_monitoring.md | 40 +++++-----
.../fs/feature_view/helper-columns.md | 67 +++++++++++-----
.../model-dependent-transformations.md | 66 ++++++++++------
docs/user_guides/fs/feature_view/overview.md | 15 ++--
docs/user_guides/fs/feature_view/query.md | 55 ++++++++++---
.../fs/feature_view/spine-query.md | 22 +++---
.../fs/feature_view/training-data.md | 79 +++++++++++--------
docs/user_guides/fs/provenance/provenance.md | 22 +++---
docs/user_guides/fs/sharing/sharing.md | 6 +-
docs/user_guides/fs/tags/tags.md | 6 +-
.../fs/transformation_functions.md | 51 ++++++++----
.../integrations/databricks/api_key.md | 11 +--
.../integrations/databricks/configuration.md | 11 +--
.../integrations/emr/emr_configuration.md | 1 -
docs/user_guides/integrations/hdinsight.md | 12 +--
.../integrations/mlstudio_designer.md | 36 +++++----
.../integrations/mlstudio_notebooks.md | 20 ++---
docs/user_guides/integrations/python.md | 13 +--
docs/user_guides/integrations/spark.md | 13 +--
docs/user_guides/migration/40_migration.md | 13 +--
.../mlops/registry/frameworks/llm.md | 7 +-
.../mlops/registry/frameworks/python.md | 2 +-
.../mlops/registry/frameworks/skl.md | 2 +-
.../mlops/registry/frameworks/tch.md | 3 +-
.../mlops/registry/frameworks/tf.md | 2 +-
.../mlops/registry/input_example.md | 3 +-
.../mlops/registry/model_evaluation_images.md | 4 +-
.../mlops/registry/model_schema.md | 13 ++-
.../user_guides/mlops/serving/api-protocol.md | 4 +-
.../mlops/serving/deployment-state.md | 6 ++
docs/user_guides/mlops/serving/deployment.md | 6 ++
.../mlops/serving/inference-batcher.md | 20 ++---
.../mlops/serving/inference-logger.md | 19 +++--
docs/user_guides/mlops/serving/predictor.md | 41 ++++++----
docs/user_guides/mlops/serving/resources.md | 23 ++++--
docs/user_guides/mlops/serving/rest-api.md | 17 ++--
docs/user_guides/mlops/serving/transformer.md | 22 ++++--
.../mlops/serving/troubleshooting.md | 6 ++
docs/user_guides/projects/airflow/airflow.md | 33 ++++----
docs/user_guides/projects/git/clone_repo.md | 20 ++---
.../projects/git/configure_git_provider.md | 10 +--
.../projects/git/repository_actions.md | 4 -
.../user_guides/projects/jobs/notebook_job.md | 11 +--
docs/user_guides/projects/jobs/pyspark_job.md | 14 ++--
docs/user_guides/projects/jobs/python_job.md | 10 +--
docs/user_guides/projects/jobs/ray_job.md | 22 ++----
docs/user_guides/projects/jobs/spark_job.md | 10 +--
.../projects/jupyter/spark_notebook.md | 4 +-
.../projects/kafka/consume_messages.md | 8 +-
.../projects/kafka/create_schema.md | 23 +-----
.../projects/kafka/create_topic.md | 12 ++-
.../projects/kafka/produce_messages.md | 12 +--
.../projects/opensearch/connect.md | 4 -
docs/user_guides/projects/opensearch/knn.md | 29 ++-----
.../projects/python/custom_commands.md | 3 +-
77 files changed, 831 insertions(+), 724 deletions(-)
diff --git a/docs/setup_installation/admin/roleChaining.md b/docs/setup_installation/admin/roleChaining.md
index b796f0469..e1e80a899 100644
--- a/docs/setup_installation/admin/roleChaining.md
+++ b/docs/setup_installation/admin/roleChaining.md
@@ -29,13 +29,11 @@ For more details on how to create an IAM roles for Kubernetes service accounts s
```sh
account_id=$(aws sts get-caller-identity --query "Account" --output text)
oidc_provider=$(aws eks describe-cluster --name my-cluster --region $AWS_REGION --query "cluster.identity.oidc.issuer" --output text | sed -e "s/^https:\/\///")
-
```
```sh
export namespace=hopsworks
export service_account=my-service-account
-
```
```json
diff --git a/docs/setup_installation/azure/getting_started.md b/docs/setup_installation/azure/getting_started.md
index 2fa4dd09e..7c617d46d 100644
--- a/docs/setup_installation/azure/getting_started.md
+++ b/docs/setup_installation/azure/getting_started.md
@@ -203,7 +203,6 @@ hopsfs:
account: "STORAGE_ACCOUNT_NAME"
container: "STORAGE_ACCOUNT_CONTAINER_NAME"
identityClientId: "UA_IDENTITY_CLIENT_ID"
-
```
## Step 5: Deploy Hopsworks
diff --git a/docs/user_guides/fs/data_source/usage.md b/docs/user_guides/fs/data_source/usage.md
index 46b498809..232d14b65 100644
--- a/docs/user_guides/fs/data_source/usage.md
+++ b/docs/user_guides/fs/data_source/usage.md
@@ -19,11 +19,12 @@ We retrieve a data source simply by its unique name.
```python
import hopsworks
+
# Connect to the Hopsworks feature store
project = hopsworks.login()
feature_store = project.get_feature_store()
# Retrieve data source
- ds = feature_store.get_data_source('data_source_name')
+ ds = feature_store.get_data_source("data_source_name")
```
=== "Scala"
@@ -51,7 +52,9 @@ For data sources based on object/file storage such as AWS S3, ADLS, GCS, we set
```python
# read data into dataframe using path
- df = connector.read(data_format='data_format', path='fileScheme://bucket/path/')
+ df = connector.read(
+ data_format="data_format", path="fileScheme://bucket/path/"
+ )
```
=== "Scala"
@@ -117,7 +120,7 @@ For reading data streams, the Kafka Data Source supports reading a Kafka topic i
=== "PySpark"
```python
- df = connector.read_stream(topic='kafka_topic_name')
+ df = connector.read_stream(topic="kafka_topic_name")
```
## Creating an External Feature Group
@@ -134,14 +137,15 @@ Example for any data warehouse/SQL based external sources, we set the desired SQ
=== "PySpark"
```python
- ds.query="SELECT * FROM TABLE"
+ ds.query = "SELECT * FROM TABLE"
- fg = feature_store.create_external_feature_group(name="sales",
+ fg = feature_store.create_external_feature_group(
+ name="sales",
version=1,
description="Physical shop sales features",
- data_source = ds,
- primary_key=['ss_store_sk'],
- event_time='sale_date'
+ data_source=ds,
+ primary_key=["ss_store_sk"],
+ event_time="sale_date",
)
```
@@ -159,10 +163,10 @@ While calling the [Feature View](../../../concepts/fs/feature_view/fv_overview.m
```python
# materialise a training dataset
version, job = feature_view.create_training_data(
- description = 'describe training data',
- data_format = 'spark_data_format', # e.g., data_format = "parquet" or data_format = "csv"
- write_options = {"wait_for_job": False},
- data_source = ds
+ description="describe training data",
+ data_format="spark_data_format", # e.g., data_format = "parquet" or data_format = "csv"
+ write_options={"wait_for_job": False},
+ data_source=ds,
)
```
diff --git a/docs/user_guides/fs/feature_group/create.md b/docs/user_guides/fs/feature_group/create.md
index a6b27cc69..acbcea43c 100644
--- a/docs/user_guides/fs/feature_group/create.md
+++ b/docs/user_guides/fs/feature_group/create.md
@@ -28,14 +28,15 @@ Using the HSFS API you can execute:
=== "PySpark"
```python
- fg = feature_store.create_feature_group(name="weather",
+ fg = feature_store.create_feature_group(
+ name="weather",
version=1,
description="Weather Features",
online_enabled=True,
- primary_key=['location_id'],
- partition_key=['day'],
- event_time='event_time',
- time_travel_format='DELTA',
+ primary_key=["location_id"],
+ partition_key=["day"],
+ event_time="event_time",
+ time_travel_format="DELTA",
)
```
@@ -121,12 +122,18 @@ The code example shows the creation of an online-enabled feature group that stor
```python
fg = fs.create_feature_group(
- name='air_quality',
- description='Air Quality characteristics of each day',
+ name="air_quality",
+ description="Air Quality characteristics of each day",
version=1,
- primary_key=['city','date'],
+ primary_key=["city", "date"],
online_enabled=True,
- online_config={'table_space': 'ts_1', 'online_comments': ['NDB_TABLE=READ_BACKUP=1', 'NDB_TABLE=PARTITION_BALANCE=FOR_RP_BY_LDM_X_2']}
+ online_config={
+ "table_space": "ts_1",
+ "online_comments": [
+ "NDB_TABLE=READ_BACKUP=1",
+ "NDB_TABLE=PARTITION_BALANCE=FOR_RP_BY_LDM_X_2",
+ ],
+ },
)
```
@@ -150,29 +157,31 @@ For Python environments, only the stream API is supported (stream=True).
=== "Python"
```python
- fg = feature_store.create_feature_group(name="weather",
+ fg = feature_store.create_feature_group(
+ name="weather",
version=1,
description="Weather Features",
online_enabled=True,
- primary_key=['location_id'],
- partition_key=['day'],
- event_time='event_time',
- time_travel_format='HUDI',
+ primary_key=["location_id"],
+ partition_key=["day"],
+ event_time="event_time",
+ time_travel_format="HUDI",
)
```
=== "PySpark"
```python
- fg = feature_store.create_feature_group(name="weather",
+ fg = feature_store.create_feature_group(
+ name="weather",
version=1,
description="Weather Features",
online_enabled=True,
- primary_key=['location_id'],
- partition_key=['day'],
- event_time='event_time',
- time_travel_format='HUDI',
- stream=True
+ primary_key=["location_id"],
+ partition_key=["day"],
+ event_time="event_time",
+ time_travel_format="HUDI",
+ stream=True,
)
```
@@ -221,10 +230,7 @@ For example, most commonly, filtering is done on the event time column of a feat
query = fg.select_all()
# create a simple feature view
-fv = fs.create_feature_view(
- name='transactions_view',
- query=query
-)
+fv = fs.create_feature_view(name="transactions_view", query=query)
# set up dates
start_time = "2022-01-01"
@@ -234,7 +240,7 @@ end_time = "2022-06-30"
version, job = fv.create_training_data(
start_time=start_time,
end_time=end_time,
- description='Description of a dataset',
+ description="Description of a dataset",
)
```
@@ -280,9 +286,9 @@ For example, the inserted dataframe (unique combination of partition key values)
!!! example "Default Hudi partitioning"
```python
write_options = {
- 'hoodie.bulkinsert.shuffle.parallelism': 5,
- 'hoodie.insert.shuffle.parallelism': 5,
- 'hoodie.upsert.shuffle.parallelism': 5
+ "hoodie.bulkinsert.shuffle.parallelism": 5,
+ "hoodie.insert.shuffle.parallelism": 5,
+ "hoodie.upsert.shuffle.parallelism": 5,
}
```
That means, using Spark, Hudi shuffles the data into five in-memory partitions, which will each map to a task and finally a parquet file (see figure below).
@@ -305,9 +311,9 @@ If the inserted Dataframe contains multiple feature group partitions, the parque
You can change the write options on every insert, depending also on the size of the data you are writing:
```python
write_options = {
- 'hoodie.bulkinsert.shuffle.parallelism': 5,
- 'hoodie.insert.shuffle.parallelism': 5,
- 'hoodie.upsert.shuffle.parallelism': 5
+ "hoodie.bulkinsert.shuffle.parallelism": 5,
+ "hoodie.insert.shuffle.parallelism": 5,
+ "hoodie.upsert.shuffle.parallelism": 5,
}
fg.insert(df, write_options=write_options)
```
diff --git a/docs/user_guides/fs/feature_group/create_external.md b/docs/user_guides/fs/feature_group/create_external.md
index e0a779a5e..7b5e7c248 100644
--- a/docs/user_guides/fs/feature_group/create_external.md
+++ b/docs/user_guides/fs/feature_group/create_external.md
@@ -48,13 +48,14 @@ Once you have defined the metadata, you can
GROUP BY ss_store_sk, sales_date
"""
- fg = feature_store.create_external_feature_group(name="sales",
+ fg = feature_store.create_external_feature_group(
+ name="sales",
version=1,
description="Physical shop sales features",
query=query,
data_source=ds,
- primary_key=['ss_store_sk'],
- event_time='sale_date'
+ primary_key=["ss_store_sk"],
+ event_time="sale_date",
)
fg.save()
@@ -65,13 +66,14 @@ Once you have defined the metadata, you can
=== "Python"
```python
- fg = feature_store.create_external_feature_group(name="sales",
+ fg = feature_store.create_external_feature_group(
+ name="sales",
version=1,
description="Physical shop sales features",
data_format="parquet",
data_source=ds,
- primary_key=['ss_store_sk'],
- event_time='sale_date'
+ primary_key=["ss_store_sk"],
+ event_time="sale_date",
)
fg.save()
@@ -108,14 +110,15 @@ For an external feature group to be available online, during the creation of the
```python
external_fg = fs.create_external_feature_group(
- name="sales",
- version=1,
- description="Physical shop sales features",
- query=query,
- data_source=ds,
- primary_key=['ss_store_sk'],
- event_time='sale_date',
- online_enabled=True)
+ name="sales",
+ version=1,
+ description="Physical shop sales features",
+ query=query,
+ data_source=ds,
+ primary_key=["ss_store_sk"],
+ event_time="sale_date",
+ online_enabled=True,
+ )
external_fg.save()
# read from external storage and filter data to sync to online
diff --git a/docs/user_guides/fs/feature_group/create_spine.md b/docs/user_guides/fs/feature_group/create_spine.md
index 0efba91c2..75434fcfd 100644
--- a/docs/user_guides/fs/feature_group/create_spine.md
+++ b/docs/user_guides/fs/feature_group/create_spine.md
@@ -28,9 +28,9 @@ Additionally, apart from primary key and event time information, a Spark datafra
name="spine_transactions",
version=1,
description="Transaction data",
- primary_key=['cc_num'],
- event_time='datetime',
- dataframe=trans_df
+ primary_key=["cc_num"],
+ event_time="datetime",
+ dataframe=trans_df,
)
```
diff --git a/docs/user_guides/fs/feature_group/data_types.md b/docs/user_guides/fs/feature_group/data_types.md
index 2f8a1bfd3..c3def5411 100644
--- a/docs/user_guides/fs/feature_group/data_types.md
+++ b/docs/user_guides/fs/feature_group/data_types.md
@@ -170,7 +170,9 @@ The validation is enabled by default and can be disabled by setting below key wo
=== "Python"
```python
- feature_group.insert(df, validation_options={'online_schema_validation':False})
+ feature_group.insert(
+ df, validation_options={"online_schema_validation": False}
+ )
```
The most important validation checks or error messages are mentioned below along with possible corrective actions.
@@ -185,18 +187,18 @@ The most important validation checks or error messages are mentioned below along
```python
# Drop rows: assuming 'id' is the primary key column
- df = df.dropna(subset=['id'])
+ df = df.dropna(subset=["id"])
# For composite keys
- df = df.dropna(subset=['id1', 'id2'])
+ df = df.dropna(subset=["id1", "id2"])
# Data imputation: replace null values with incrementing last integer id
# existing max id
- max_id = df['id'].max()
+ max_id = df["id"].max()
# counter to generate new id
next_id = max_id + 1
# for each null id, assign the next id incrementally
- for idx in df[df['id'].isna()].index:
- df.loc[idx, 'id'] = next_id
+ for idx in df[df["id"].isna()].index:
+ df.loc[idx, "id"] = next_id
next_id += 1
```
@@ -209,7 +211,7 @@ The most important validation checks or error messages are mentioned below along
```python
# incrementing primary key up to the length of the dataframe
- df['id'] = range(1, len(df) + 1)
+ df["id"] = range(1, len(df) + 1)
```
03. String length exceeded
@@ -224,7 +226,7 @@ The most important validation checks or error messages are mentioned below along
```python
max_length = 100
- df['text_column'] = df['text_column'].str.slice(0, max_length)
+ df["text_column"] = df["text_column"].str.slice(0, max_length)
```
- Another option is to simply [create a new version of the feature group][hsfs.feature_store.FeatureStore.get_or_create_feature_group] and insert the dataframe.
@@ -238,19 +240,23 @@ The most important validation checks or error messages are mentioned below along
```python
import pandas as pd
+
# example dummy dataframe with the string column
- df = pd.DataFrame(columns=['id', 'string_col'])
+ df = pd.DataFrame(columns=["id", "string_col"])
from hsfs.feature import Feature
+
features = [
- Feature(name="id",type="bigint",online_type="bigint"),
- Feature(name="string_col",type="string",online_type="text")
+ Feature(name="id", type="bigint", online_type="bigint"),
+ Feature(name="string_col", type="string", online_type="text"),
]
- fg = fs.get_or_create_feature_group(name="fg_manual_text_schema",
- version=1,
- features=features,
- online_enabled=True,
- primary_key=['id'])
+ fg = fs.get_or_create_feature_group(
+ name="fg_manual_text_schema",
+ version=1,
+ features=features,
+ online_enabled=True,
+ primary_key=["id"],
+ )
fg.insert(df)
```
@@ -291,13 +297,13 @@ You can explicitly define the feature group schema as follows:
from hsfs.feature import Feature
features = [
- Feature(name="id",type="int",online_type="int"),
- Feature(name="name",type="string",online_type="varchar(20)")
+ Feature(name="id", type="int", online_type="int"),
+ Feature(name="name", type="string", online_type="varchar(20)"),
]
- fg = fs.create_feature_group(name="fg_manual_schema",
- features=features,
- online_enabled=True)
+ fg = fs.create_feature_group(
+ name="fg_manual_schema", features=features, online_enabled=True
+ )
fg.save(features)
```
@@ -312,8 +318,8 @@ Adding additional features to an existing feature group is not considered a brea
from hsfs.feature import Feature
features = [
- Feature(name="id",type="int",online_type="int"),
- Feature(name="name",type="string",online_type="varchar(20)")
+ Feature(name="id", type="int", online_type="int"),
+ Feature(name="name", type="string", online_type="varchar(20)"),
]
fg = fs.get_feature_group(name="example", version=1)
diff --git a/docs/user_guides/fs/feature_group/data_validation.md b/docs/user_guides/fs/feature_group/data_validation.md
index f2a37c65d..1d0e370c6 100644
--- a/docs/user_guides/fs/feature_group/data_validation.md
+++ b/docs/user_guides/fs/feature_group/data_validation.md
@@ -109,7 +109,7 @@ In order to define and validate an expectation when writing to a Feature Group,
Connect the client running your notebooks to Hopsworks.
-```python3
+```python
import hopsworks
project = hopsworks.login()
@@ -124,10 +124,13 @@ The `fs` Feature Store entity is now ready to be used to insert or read data fro
Load your data in a DataFrame using the usual pandas API.
-```python3
+```python
import pandas as pd
-df = pd.read_csv("https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/transactions.csv", parse_dates=["datetime"])
+df = pd.read_csv(
+ "https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/transactions.csv",
+ parse_dates=["datetime"],
+)
df.head(3)
```
@@ -143,7 +146,7 @@ Everything is done using the Great Expectations API so you can re-use any prior
Create (or import an existing) expectation suite using the Great Expectations library.
This suite will hold all the validation tests we want to perform on our data before inserting them into Hopsworks.
-```python3
+```python
import great_expectations as ge
expectation_suite = ge.core.ExpectationSuite(
@@ -156,26 +159,18 @@ expectation_suite = ge.core.ExpectationSuite(
Add some expectations to your suite.
Each expectation configuration corresponds to a validation test to be run against your data.
-```python3
+```python
expectation_suite.add_expectation(
ge.core.ExpectationConfiguration(
expectation_type="expect_column_min_to_be_between",
- kwargs={
- "column": "foo_id",
- "min_value": 0,
- "max_value": 1
- }
+ kwargs={"column": "foo_id", "min_value": 0, "max_value": 1},
)
)
expectation_suite.add_expectation(
ge.core.ExpectationConfiguration(
expectation_type="expect_column_value_lengths_to_be_between",
- kwargs={
- "column": "bar_name",
- "min_value": 3,
- "max_value": 10
- }
+ kwargs={"column": "bar_name", "min_value": 3, "max_value": 10},
)
)
```
@@ -185,7 +180,7 @@ expectation_suite.add_expectation(
Building an Expectation Suite by hand can be a major time commitment when you have dozens of Features.
Great Expectations offers `Profiler` classes to inspect a sample of your data and infer a suitable Expectation Suite that you can then register with Hopsworks.
-```python3
+```python
ge_profiler = ge.profile.BasicSuiteBuilderProfiler()
expectation_suite_profiler, _ = ge_profiler.profile(ge.from_pandas(df))
```
@@ -199,20 +194,20 @@ Once a Feature Group is registered in the Feature Store, you can use it to inser
For more information see [create Feature Group](create.md).
To benefit from automatic validation on insertion, attach your newly created Expectation Suite when creating the Feature Group:
-```python3
+```python
fg = fs.create_feature_group(
- "fg_with_data_validation",
- version=1,
- description="Validated data",
- primary_key=['foo_id'],
- online_enabled=False,
- expectation_suite=expectation_suite
+ "fg_with_data_validation",
+ version=1,
+ description="Validated data",
+ primary_key=["foo_id"],
+ online_enabled=False,
+ expectation_suite=expectation_suite,
)
```
or, if the Feature Group already exists, you can simply run:
-```python3
+```python
fg.save_expectation_suite(expectation_suite)
```
@@ -220,7 +215,7 @@ That is all there is to it.
Hopsworks will now automatically use your suite to validate the DataFrames you want to write to the Feature Group.
Try it out!
-```python3
+```python
job, validation_report = fg.insert(df.head(5))
```
@@ -242,7 +237,7 @@ As you can see, your Feature Group conveniently gather all in one place: your da
Hopsworks client API allows you to retrieve validation reports for further analysis.
-```python3
+```python
# load multiple reports
validation_reports = fg.get_validation_reports()
@@ -252,10 +247,8 @@ ge_latest_report = fg.get_latest_validation_report()
Similarly, you can retrieve the history of validation results for a particular expectation, e.g. to plot a time series of a given expectation's observed value over time.
-```python3
-validation_history = fg.get_validation_history(
- expectationId=1
-)
+```python
+validation_history = fg.get_validation_history(expectationId=1)
```
You can find the expectationIds in the UI or using `fg.get_expectation_suite` and looking it up in the expectation's meta field.
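
A hedged sketch of that lookup (the exact meta key name is an assumption):

```python
suite = fg.get_expectation_suite()

# Each expectation carries its Hopsworks id in its meta field
# (key name assumed here).
for expectation in suite.expectations:
    print(expectation.meta.get("expectationId"), expectation.expectation_type)
```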
diff --git a/docs/user_guides/fs/feature_group/data_validation_advanced.md b/docs/user_guides/fs/feature_group/data_validation_advanced.md
index e3526a41a..86c63e731 100644
--- a/docs/user_guides/fs/feature_group/data_validation_advanced.md
+++ b/docs/user_guides/fs/feature_group/data_validation_advanced.md
@@ -23,8 +23,8 @@ Go to the Feature Group edit page, in the Expectation section you can choose bet
#### Validation Ingestion Policy in Python
-```python3
-fg.expectation_suite.validation_ingestion_policy = "ALWAYS" # "STRICT"
+```python
+fg.expectation_suite.validation_ingestion_policy = "ALWAYS" # "STRICT"
```
If your suite is registered with Hopsworks, it will persist the change to the server.
@@ -44,15 +44,15 @@ This will be used as the default option but can be overridden via the API.
To disable data validation until further notice in the API, you can update the `run_validation` field of the expectation suite.
If your suite is registered with Hopsworks, this will persist the change to the server.
-```python3
+```python
fg.expectation_suite.run_validation = False
```
If you wish to override the default behaviour of the suite when inserting data in the Feature Group, you can do so via the `validation_options` kwarg.
The example below will enable validation for this insertion only.
-```python3
-fg.insert(df_to_validate, validation_options={"run_validation" : True})
+```python
+fg.insert(df_to_validate, validation_options={"run_validation": True})
```
We recommend avoiding this option in scheduled jobs, as it silently overrides the behaviour displayed in the UI and prevents changes to the suite's default behaviour from taking effect on the job.
@@ -79,20 +79,18 @@ Note that you must have inserted data in the FG and attached the expectation sui
Get an expectation with a given id:
-```python3
+```python
my_expectation = fg.expectation_suite.get_expectation(
- expectation_id = my_expectation_id
+ expectation_id=my_expectation_id
)
```
Add a new expectation:
-```python3
+```python
new_expectation = ge.core.ExpectationConfiguration(
expectation_type="expect_column_values_not_to_be_null",
- kwargs={
- "mostly": 1
- }
+ kwargs={"mostly": 1},
)
fg.expectation_suite.add_expectation(new_expectation)
@@ -100,7 +98,7 @@ fg.expectation_suite.add_expectation(new_expectation)
Edit the kwargs of an existing expectation:
-```python3
+```python
existing_expectation = fg.expectation_suite.get_expectation(
expectation_id=existing_expectation_id
)
@@ -112,7 +110,7 @@ fg.expectation_suite.replace_expectation(existing_expectation)
Remove an expectation:
-```python3
+```python
fg.expectation_suite.remove_expectation(
expectation_id=id_of_expectation_to_delete
)
@@ -120,7 +118,7 @@ fg.expectation_suite.remove_expectation(
If you want to deal only with the Great Expectations API:
-```python3
+```python
my_suite = fg.get_expectation_suite()
my_suite.add_expectation(new_expectation)
@@ -139,7 +137,7 @@ The UI does not currently support upload of a validation report.
#### Save Validation Reports in Python
-```python3
+```python
fg.save_validation_report(ge_report)
```
@@ -155,7 +153,7 @@ One tab allows you to check the report history with general information, while t
#### Monitor and Fetch Validation Reports in Python
-```python3
+```python
# convenience method for rapid development
ge_latest_report = fg.get_latest_validation_report()
# fetching the latest summary prints a link to the UI
@@ -178,7 +176,7 @@ The button will launch a job which will read the Feature Group data, run validat
#### Validate Your Data Manually in Python
-```python3
+```python
ge_report = fg.validate(df, ingestion_result="EXPERIMENT")
# set the save_report parameter to False to skip uploading the report to Hopsworks
@@ -188,13 +186,13 @@ ge_report = fg.validate(df, ingestion_result="EXPERIMENT")
If you want to apply validation to the data already in the Feature Group, you can call `.validate` without providing data.
It will read the data in the Feature Group.
-```python3
+```python
report = fg.validate()
```
As validation objects returned by Hopsworks are native Great Expectations objects, you can run validation using the usual Great Expectations syntax:
-```python3
+```python
ge_df = ge.from_pandas(df, expectation_suite=fg.get_expectation_suite())
ge_report = ge_df.validate()
```
diff --git a/docs/user_guides/fs/feature_group/data_validation_best_practices.md b/docs/user_guides/fs/feature_group/data_validation_best_practices.md
index 64d36fa2d..9274ebebe 100644
--- a/docs/user_guides/fs/feature_group/data_validation_best_practices.md
+++ b/docs/user_guides/fs/feature_group/data_validation_best_practices.md
@@ -15,13 +15,17 @@ As often with data validation, the best piece of advice is to set it up early in
Use this phase to build a history you can then use when it becomes time to set quality requirements for a project in production.
We made a code snippet to help you get started quickly:
-```python3
+```python
# Load sample data.
# Replace it with your own!
-my_data_df = pd.read_csv("https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/credit_cards.csv")
+my_data_df = pd.read_csv(
+ "https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/credit_cards.csv"
+)
# Use Great Expectation profiler (ignore deprecation warning)
-expectation_suite_profiled, validation_report = ge.from_pandas(my_data_df).profile(profiler=ge.profile.BasicSuiteBuilderProfiler)
+expectation_suite_profiled, validation_report = ge.from_pandas(
+ my_data_df
+).profile(profiler=ge.profile.BasicSuiteBuilderProfiler)
# Create a Feature Group on hopsworks with an expectation suite attached.
# Don't forget to change the primary key!
@@ -29,13 +33,14 @@ my_validated_data_fg = fs.get_or_create_feature_group(
name="my_validated_data_fg",
version=1,
description="My data",
- primary_key=['cc_num'],
- expectation_suite=expectation_suite_profiled)
+ primary_key=["cc_num"],
+ expectation_suite=expectation_suite_profiled,
+)
```
Any data you insert in the Feature Group from now will be validated and a report will be uploaded to Hopsworks.
-```python3
+```python
# Insert and validate your data
insert_job, validation_report = my_validated_data_fg.insert(my_data_df)
```
@@ -73,10 +78,8 @@ Below are some simple tips and snippets to make the most of your data validation
Whether you use an existing Feature Group or create a new one (recommended) for production, we recommend setting the validation ingestion policy of your Expectation Suite to `"STRICT"`.
-```python3
-fg_prod.save_expectation_suite(
- my_suite,
- validation_ingestion_policy="STRICT")
+```python
+fg_prod.save_expectation_suite(my_suite, validation_ingestion_policy="STRICT")
```
In this setup, Hopsworks will abort inserting a DataFrame that does not successfully fulfill all expectations in the attached Expectation Suite.
@@ -87,7 +90,7 @@ This ensures data quality standards are upheld for every insertion and provide d
Aborting insertions of DataFrames which do not satisfy the data quality standards can lead to data loss in your materialization job.
To avoid such loss, we recommend creating a duplicate Feature Group with the same Expectation Suite in `"ALWAYS"` mode, which will hold the rejected data.
-```python3
+```python
job, report = fg_prod.insert(df)
if report["success"] is False:
@@ -99,17 +102,17 @@ if report["success"] is False:
You can easily retrieve the validation history of a specific expectation to export it to your favourite visualisation tool.
You can filter on time and on whether insertion was successful or not.
-```python3
+```python
validation_history = fg.get_validation_history(
- expectation_id=my_id,
- filters=["REJECTED", "UNKNOWN"],
- ge_type=False
+ expectation_id=my_id, filters=["REJECTED", "UNKNOWN"], ge_type=False
)
timeseries = pd.DataFrame(
{
- "observed_value": [res.result["observed_value"] for res in validation_history],
- "validation_time": [res.validation_time for res in validation_history]
+ "observed_value": [
+ res.result["observed_value"] for res in validation_history
+ ],
+ "validation_time": [res.validation_time for res in validation_history],
}
)
diff --git a/docs/user_guides/fs/feature_group/deprecation.md b/docs/user_guides/fs/feature_group/deprecation.md
index 531c11a3b..c0b11079d 100644
--- a/docs/user_guides/fs/feature_group/deprecation.md
+++ b/docs/user_guides/fs/feature_group/deprecation.md
@@ -25,7 +25,9 @@ To deprecate a feature group using the HSFS APIs you need to provide a [Feature
=== "Python"
```python
- fg = fs.get_feature_group(name="feature_group_name", version=feature_group_version)
+ fg = fs.get_feature_group(
+ name="feature_group_name", version=feature_group_version
+ )
```
### Deprecate Feature Group
diff --git a/docs/user_guides/fs/feature_group/feature_monitoring.md b/docs/user_guides/fs/feature_group/feature_monitoring.md
index 509d741c9..ccb8201e3 100644
--- a/docs/user_guides/fs/feature_group/feature_monitoring.md
+++ b/docs/user_guides/fs/feature_group/feature_monitoring.md
@@ -39,7 +39,7 @@ Connect the client running your notebooks to Hopsworks.
=== "Python"
- ```python3
+ ```python
import hopsworks
project = hopsworks.login()
@@ -60,7 +60,7 @@ The following is a code example for getting or creating a Feature Group with nam
=== "Python"
- ```python3
+ ```python
# Retrieve an existing feature group
trans_fg = fs.get_feature_group("trans_fg", version=1)
@@ -83,7 +83,7 @@ You can setup statistics monitoring on a ==single feature or multiple features==
=== "Python"
- ```python3
+ ```python
# compute statistics for all the features
fg_monitoring_config = trans_fg.create_statistics_monitoring(
name="trans_fg_all_features_monitoring",
@@ -105,7 +105,7 @@ You can create multiple feature monitoring configurations for the same Feature G
=== "Python"
- ```python3
+ ```python
fg_monitoring_config = trans_fg.create_feature_monitoring(
name="trans_fg_amount_monitoring",
feature_name="amount",
@@ -120,12 +120,12 @@ You can modify the default schedule by adjusting the `cron_expression`, `start_d
=== "Python"
- ```python3
+ ```python
fg_monitoring_config = trans_fg.create_statistics_monitoring(
name="trans_fg_all_features_monitoring",
description="Compute statistics on all data of all features of the Feature Group on a weekly basis",
cron_expression="0 0 12 ? * MON *", # weekly
- row_percentage=0.8, # use 80% of the data
+ row_percentage=0.8, # use 80% of the data
)
# or
@@ -134,7 +134,7 @@ You can modify the default schedule by adjusting the `cron_expression`, `start_d
feature_name="amount",
description="Compute descriptive statistics on the amount Feature of the Feature Group on a weekly basis",
cron_expression="0 0 12 ? * MON *", # weekly
- row_percentage=0.8, # use 80% of the data
+ row_percentage=0.8, # use 80% of the data
)
```
@@ -146,10 +146,10 @@ Additionally, you can specify the percentage of feature data on which statistics
=== "Python"
- ```python3
+ ```python
fm_monitoring_config.with_detection_window(
window_length="1w", # data ingested during one week
- time_offset="1w", # starting from last week
+ time_offset="1w", # starting from last week
row_percentage=0.8, # use 80% of the data
)
```
@@ -160,11 +160,11 @@ When setting up feature monitoring for a Feature Group, reference windows can be
=== "Python"
- ```python3
+ ```python
# compare statistics against a reference window
fm_monitoring_config.with_reference_window(
window_length="1w", # data ingested during one week
- time_offset="2w", # starting from two weeks ago
+ time_offset="2w", # starting from two weeks ago
row_percentage=0.8, # use 80% of the data
)
@@ -182,12 +182,12 @@ Then, you can define a relative or absolute threshold using the `threshold` and
=== "Python"
- ```python3
+ ```python
fm_monitoring_config.compare_on(
metric="mean",
threshold=0.2, # a relative change over 20% is considered anomalous
relative=True, # relative or absolute change
- strict=False, # strict or relaxed comparison
+ strict=False, # strict or relaxed comparison
)
```
@@ -201,7 +201,7 @@ Once the configuration is saved, the schedule for the statistics computation and
=== "Python"
- ```python3
+ ```python
fm_monitoring_config.save()
```
diff --git a/docs/user_guides/fs/feature_group/notification.md b/docs/user_guides/fs/feature_group/notification.md
index 27402ed39..b3d2010c6 100644
--- a/docs/user_guides/fs/feature_group/notification.md
+++ b/docs/user_guides/fs/feature_group/notification.md
@@ -26,11 +26,12 @@ To enable Change Data Capture for an online-enabled feature group using the HSFS
```python
fg = fs.create_feature_group(
- name="feature_group_name",
- version=feature_group_version,
- primary_key=feature_group_primary_keys,
- online_enabled=True,
- notification_topic_name="notification_topic_name")
+ name="feature_group_name",
+ version=feature_group_version,
+ primary_key=feature_group_primary_keys,
+ online_enabled=True,
+ notification_topic_name="notification_topic_name",
+ )
```
### Update Feature Group with Change Data Capture topic using Python
@@ -43,7 +44,8 @@ With the default configuration, it can take up to 30 minutes for these changes t
```python
fg.update_notification_topic_name(
- notification_topic_name="new_notification_topic_name")
+ notification_topic_name="new_notification_topic_name"
+ )
```
## Using UI
diff --git a/docs/user_guides/fs/feature_group/on_demand_transformations.md b/docs/user_guides/fs/feature_group/on_demand_transformations.md
index 57cdfac06..9eadc8c45 100644
--- a/docs/user_guides/fs/feature_group/on_demand_transformations.md
+++ b/docs/user_guides/fs/feature_group/on_demand_transformations.md
@@ -27,19 +27,22 @@ If no feature names are provided, the transformation function will default to us
def transaction_age(transaction_date, current_date):
return (current_date - transaction_date).dt.days
+
@hopsworks.udf(return_type=[str, str], drop=["current_date"])
def stripped_strings(country, city):
return country.strip(), city.strip()
+
# Attach transformation function to feature group to create on-demand transformation function.
- fg = feature_store.create_feature_group(name="fg_transactions",
- version=1,
- description="Transaction Features",
- online_enabled=True,
- primary_key=['id'],
- event_time='event_time',
- transformation_functions=[transaction_age, stripped_strings]
- )
+ fg = feature_store.create_feature_group(
+ name="fg_transactions",
+ version=1,
+ description="Transaction Features",
+ online_enabled=True,
+ primary_key=["id"],
+ event_time="event_time",
+ transformation_functions=[transaction_age, stripped_strings],
+ )
```
### Specifying input features
@@ -50,14 +53,17 @@ The features to be used by the on-demand transformation function can be specifie
=== "Python"
```python
- fg = feature_store.create_feature_group(name="fg_transactions",
- version=1,
- description="Transaction Features",
- online_enabled=True,
- primary_key=['id'],
- event_time='event_time',
- transformation_functions=[age_transaction('transaction_time', 'current_time')]
- )
+ fg = feature_store.create_feature_group(
+ name="fg_transactions",
+ version=1,
+ description="Transaction Features",
+ online_enabled=True,
+ primary_key=["id"],
+ event_time="event_time",
+ transformation_functions=[
+ age_transaction("transaction_time", "current_time")
+ ],
+ )
```
## Usage
@@ -84,17 +90,19 @@ These on-demand features are equivalent to regular features, and [model-dependen
```python
# Selecting on-demand features in query
- query = fg.select(["id", "feature1", "feature2", "on_demand_feature3", "on_demand_feature4"])
+ query = fg.select(
+ ["id", "feature1", "feature2", "on_demand_feature3", "on_demand_feature4"]
+ )
# Creating a feature view using a query that contains on-demand transformations and model-dependent transformations
feature_view = fs.create_feature_view(
- name='transactions_view',
- query=query,
- transformation_functions=[
- min_max_scaler("feature1"),
- min_max_scaler("on_demand_feature3"),
- ]
- )
+ name="transactions_view",
+ query=query,
+ transformation_functions=[
+ min_max_scaler("feature1"),
+ min_max_scaler("on_demand_feature3"),
+ ],
+ )
```
### Computing on-demand features
@@ -251,7 +259,10 @@ On-demand transformation functions can also be accessed and executed as normal f
```python
# Specify request parameters for each serving key.
feature_vector = feature_view.get_feature_vector(
- entry={"id": 1}, transform=False, on_demand_features=False, return_type="pandas"
+ entry={"id": 1},
+ transform=False,
+ on_demand_features=False,
+ return_type="pandas",
)
# Applying model dependent transformations
diff --git a/docs/user_guides/fs/feature_group/online_ingestion_observability.md b/docs/user_guides/fs/feature_group/online_ingestion_observability.md
index 08305c6fe..734b3e120 100644
--- a/docs/user_guides/fs/feature_group/online_ingestion_observability.md
+++ b/docs/user_guides/fs/feature_group/online_ingestion_observability.md
@@ -28,7 +28,7 @@ First, create an online-enabled feature group and insert data into it:
name="feature_group_name",
version=feature_group_version,
primary_key=feature_group_primary_keys,
- online_enabled=True
+ online_enabled=True,
)
fg.insert(fg_df)
diff --git a/docs/user_guides/fs/feature_group/statistics.md b/docs/user_guides/fs/feature_group/statistics.md
index a2543a044..cad56e2a4 100644
--- a/docs/user_guides/fs/feature_group/statistics.md
+++ b/docs/user_guides/fs/feature_group/statistics.md
@@ -45,20 +45,21 @@ By default the value is empty list `[]` and the statistics are computed for all
=== "Python"
```python
- fg = feature_store.create_feature_group(name="weather",
+ fg = feature_store.create_feature_group(
+ name="weather",
version=1,
description="Weather Features",
online_enabled=True,
- primary_key=['location_id'],
- partition_key=['day'],
- event_time='event_time',
+ primary_key=["location_id"],
+ partition_key=["day"],
+ event_time="event_time",
statistics_config={
"enabled": True,
"histograms": True,
"correlations": True,
"exact_uniqueness": False,
- "columns": []
- }
+ "columns": [],
+ },
)
```
@@ -75,7 +76,7 @@ Either to add or remove a class of statistics, or to change the set of features
"histograms": False,
"correlations": False,
"exact_uniqueness": False,
- "columns": ['location_id', 'min_temp', 'max_temp']
+ "columns": ["location_id", "min_temp", "max_temp"],
}
fg.update_statistics_config()
@@ -98,7 +99,7 @@ As external feature groups are read only from an Hopsworks perspective, statisti
=== "Python"
```python
- fg.compute_statistics(wallclock_time='20220611 20:00')
+ fg.compute_statistics(wallclock_time="20220611 20:00")
```
## Inspect statistics
diff --git a/docs/user_guides/fs/feature_group/ttl.md b/docs/user_guides/fs/feature_group/ttl.md
index f673605e3..5a8b7e55d 100644
--- a/docs/user_guides/fs/feature_group/ttl.md
+++ b/docs/user_guides/fs/feature_group/ttl.md
@@ -27,6 +27,7 @@ Data rows where `event_time` is older than the TTL period will be automatically
```python
from datetime import datetime, timezone
+
import pandas as pd
# Assume you already have a feature store handle
diff --git a/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md b/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md
index d820d1ef8..c29d2be7a 100644
--- a/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md
+++ b/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md
@@ -15,7 +15,7 @@ You can retrieve one or more feature monitoring configurations from the Feature
=== "Python"
- ```python3
+ ```python
# retrieve all configurations
fm_configs = trans_fg.get_feature_monitoring_configs() # from a feature group
fm_configs = trans_fv.get_feature_monitoring_configs() # or a feature view
@@ -52,7 +52,7 @@ You can easily enable or disable a specific feature monitoring configuration usi
=== "Python"
- ```python3
+ ```python
# disable a specific feature monitoring configuration
fm_config.disable()
@@ -79,7 +79,7 @@ To trigger the feature monitoring job once from the Python API, use the feature
=== "Python"
- ```python3
+ ```python
# run the feature monitoring job once
fm_config.run_job()
```
@@ -99,7 +99,7 @@ Alternatively, you can retrieve all the statistics and comparison results using
=== "Python"
- ```python3
+ ```python
# retrieve all feature monitoring results from a specific config
fm_results = fm_config.get_history()
@@ -120,6 +120,6 @@ You can delete feature monitoring configurations using the Python API only, as s
=== "Python"
- ```python3
+ ```python
fm_config.delete()
```
diff --git a/docs/user_guides/fs/feature_monitoring/index.md b/docs/user_guides/fs/feature_monitoring/index.md
index 1916ec18e..698293831 100644
--- a/docs/user_guides/fs/feature_monitoring/index.md
+++ b/docs/user_guides/fs/feature_monitoring/index.md
@@ -17,7 +17,6 @@ Hopsworks feature monitoring user interface is centered around two functionaliti
!!! important
To enable feature monitoring in Hopsworks, you need to set the `enable_feature_monitoring` [configuration option](../../../setup_installation/admin/variables.md) to `true`.
This can also be achieved in the cluster definition by setting the following attribute:
-
```
hopsworks:
enable_feature_monitoring: "true"
diff --git a/docs/user_guides/fs/feature_view/batch-data.md b/docs/user_guides/fs/feature_view/batch-data.md
index 8c57cfa53..ac20b86a2 100644
--- a/docs/user_guides/fs/feature_view/batch-data.md
+++ b/docs/user_guides/fs/feature_view/batch-data.md
@@ -11,9 +11,8 @@ The resultant DataFrame (or batch-scoring DataFrame) can then be fed to models t
```python
# get batch data
df = feature_view.get_batch_data(
- start_time = "20220620",
- end_time = "20220627"
- ) # return a dataframe
+ start_time="20220620", end_time="20220627"
+ ) # return a dataframe
```
=== "Java"
@@ -34,11 +33,11 @@ To retrieve the primary key(s) and/or event time when retrieving batch data for
```python
# get batch data
df = feature_view.get_batch_data(
- start_time = "20220620",
- end_time = "20220627",
- primary_key=True,
- event_time=True
- ) # return a dataframe with primary keys and event time
+ start_time="20220620",
+ end_time="20220627",
+ primary_key=True,
+ event_time=True,
+ ) # return a dataframe with primary keys and event time
```
!!! note
All primary and event time columns of all the feature groups included in the feature view will be returned.
@@ -51,9 +50,7 @@ If the service is enabled, and you want to read this particular batch data with
```python
# get batch data with Hive
df = feature_view.get_batch_data(
- start_time = "20220620",
- end_time = "20220627",
- read_options={"use_hive": True}
+ start_time="20220620", end_time="20220627", read_options={"use_hive": True}
)
```
@@ -92,5 +89,7 @@ After [defining a transformation function using a context variable](../transform
```python
# Passing context variable when retrieving batch data.
- batch_data = feature_view.get_batch_data(transformation_context={"context_parameter":10})
+ batch_data = feature_view.get_batch_data(
+ transformation_context={"context_parameter": 10}
+ )
```
diff --git a/docs/user_guides/fs/feature_view/feature-vectors.md b/docs/user_guides/fs/feature_view/feature-vectors.md
index a25bbb958..c16778a53 100644
--- a/docs/user_guides/fs/feature_view/feature-vectors.md
+++ b/docs/user_guides/fs/feature_view/feature-vectors.md
@@ -23,17 +23,11 @@ It is also possible to provide a subset of the entry, which will be discussed [b
```python
# get a single vector
- feature_view.get_feature_vector(
- entry = {"pk1": 1, "pk2": 2}
- )
+ feature_view.get_feature_vector(entry={"pk1": 1, "pk2": 2})
# get multiple vectors
feature_view.get_feature_vectors(
- entry = [
- {"pk1": 1, "pk2": 2},
- {"pk1": 3, "pk2": 4},
- {"pk1": 5, "pk2": 6}
- ]
+ entry=[{"pk1": 1, "pk2": 2}, {"pk1": 3, "pk2": 4}, {"pk1": 5, "pk2": 6}]
)
```
@@ -95,9 +89,7 @@ Take the above example assuming the feature view consists of two joined feature
```python
# get a single vector
- feature_view.get_feature_vector(
- entry = {"pk1": 1, "pk2": 2}
- )
+ feature_view.get_feature_vector(entry={"pk1": 1, "pk2": 2})
```
=== "Java"
@@ -119,11 +111,7 @@ When retrieving a batch of vectors, the behaviour is slightly different.
```python
# get multiple vectors
feature_view.get_feature_vectors(
- entry = [
- {"pk1": 1, "pk2": 2},
- {"pk1": 3, "pk2": 4},
- {"pk1": 5, "pk2": 6}
- ]
+ entry=[{"pk1": 1, "pk2": 2}, {"pk1": 3, "pk2": 4}, {"pk1": 5, "pk2": 6}]
)
```
@@ -156,18 +144,15 @@ If `pk2` is not provided, this returns feature values from the first feature gro
```python
# get a single vector with
- feature_view.get_feature_vector(
- entry = {"pk1": 1},
- allow_missing=True
- )
+ feature_view.get_feature_vector(entry={"pk1": 1}, allow_missing=True)
# get multiple vectors
feature_view.get_feature_vectors(
- entry = [
+ entry=[
{"pk1": 1},
{"pk1": 3},
],
- allow_missing=True
+ allow_missing=True,
)
```
@@ -199,22 +184,17 @@ Please note that passed features is only available in the python client but not
```python
# get a single vector
feature_view.get_feature_vector(
- entry = {"pk1": 1, "pk2": 2},
- passed_features = {"feature_a": "value_a"}
+ entry={"pk1": 1, "pk2": 2}, passed_features={"feature_a": "value_a"}
)
# get multiple vectors
feature_view.get_feature_vectors(
- entry = [
- {"pk1": 1, "pk2": 2},
- {"pk1": 3, "pk2": 4},
- {"pk1": 5, "pk2": 6}
- ],
- passed_features = [
+ entry=[{"pk1": 1, "pk2": 2}, {"pk1": 3, "pk2": 4}, {"pk1": 5, "pk2": 6}],
+ passed_features=[
{"feature_a": "value_a1"},
{"feature_a": "value_a2"},
{"feature_a": "value_a3"},
- ]
+ ],
)
```
@@ -230,12 +210,12 @@ In this second case, you do not have to provide the primary key value for that f
# in this case feature_b and feature_c
feature_view.get_feature_vector(
- entry = { "pk1": 1 },
- passed_features = {
+ entry={"pk1": 1},
+ passed_features={
"feature_a": "value_a",
"feature_b": "value_b",
- "feature_c": "value_c"
- }
+ "feature_c": "value_c",
+ },
)
```
@@ -258,6 +238,8 @@ However, you can retrieve the untransformed feature vectors without applying mod
untransformed_feature_vectors = feature_view.get_feature_vectors(
entry=[{"id": 1}, {"id": 2}], transform=False
)
+
+
```
## Retrieving feature vector without on-demand features
@@ -275,6 +257,8 @@ To achieve this, set the parameters `transform` and `on_demand_features` to `Fa
untransformed_feature_vectors = feature_view.get_feature_vectors(
entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False
)
+
+
```
## Passing Context Variables to Transformation Functions
@@ -287,9 +271,10 @@ After [defining a transformation function using a context variable](../transform
```python
# Passing context variable when retrieving feature vectors.
batch_data = feature_view.get_feature_vectors(
- entry = [{ "pk1": 1 }],
- transformation_context={"context_parameter":10}
+ entry=[{"pk1": 1}], transformation_context={"context_parameter": 10}
)
+
+
```
## Choose the right Client
@@ -318,7 +303,7 @@ my_feature_view.init_serving(
init_rest_client=True,
config_rest_client={
"api_key": "your_api_key",
- }
+ },
)
```
@@ -335,19 +320,18 @@ my_feature_view.init_serving(
config_rest_client={
"api_key": "your_api_key",
},
- default_client="rest"
+ default_client="rest",
)
# this will fetch a feature vector via REST
try:
my_feature_view.get_feature_vector(
- entry = {"pk1": 1, "pk2": 2},
+ entry={"pk1": 1, "pk2": 2},
)
except TimeoutException:
# if the REST client times out, the SQL client will be used
my_feature_view.get_feature_vector(
- entry = {"pk1": 1, "pk2": 2},
- force_sql=True
+ entry={"pk1": 1, "pk2": 2}, force_sql=True
)
```
diff --git a/docs/user_guides/fs/feature_view/feature_logging.md b/docs/user_guides/fs/feature_view/feature_logging.md
index ac3d804e5..063e44107 100644
--- a/docs/user_guides/fs/feature_view/feature_logging.md
+++ b/docs/user_guides/fs/feature_view/feature_logging.md
@@ -46,10 +46,9 @@ You have a DataFrame of features you want to log.
```python
import pandas as pd
-features = pd.DataFrame({
- "feature1": [1.1, 2.2, 3.3],
- "feature2": [4.4, 5.5, 6.6]
-})
+features = pd.DataFrame(
+ {"feature1": [1.1, 2.2, 3.3], "feature2": [4.4, 5.5, 6.6]}
+)
# Log features
feature_view.log(features)
@@ -60,15 +59,14 @@ feature_view.log(features)
You can also log predictions, and optionally the training dataset and the model used for prediction.
```python
-predictions = pd.DataFrame({
- "prediction": [0, 1, 0]
-})
+predictions = pd.DataFrame({"prediction": [0, 1, 0]})
# Log features and predictions
-feature_view.log(features,
- predictions=predictions,
- training_dataset_version=1,
- model=Model(1, "model", version=1)
+feature_view.log(
+ features,
+ predictions=predictions,
+ training_dataset_version=1,
+ model=Model(1, "model", version=1),
)
```
@@ -137,7 +135,9 @@ Accepted date format are: `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d
```python
# Read log entries from January 2022
-log_entries = feature_view.read_log(start_time="2022-01-01", end_time="2022-01-31")
+log_entries = feature_view.read_log(
+ start_time="2022-01-01", end_time="2022-01-31"
+)
print(log_entries)
```
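Assuming `read_log` returns the entries as a pandas DataFrame (as the `print` call above suggests), plain pandas is enough for quick sanity checks; a minimal sketch using the column names from the logging example:

```python
# Sketch: quick checks on the logged features and predictions
# (column names follow the example above; adjust them to your own schema).
print(log_entries[["feature1", "feature2"]].describe())
print(log_entries["prediction"].value_counts(normalize=True))
```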
diff --git a/docs/user_guides/fs/feature_view/feature_monitoring.md b/docs/user_guides/fs/feature_view/feature_monitoring.md
index 674d228f2..082ad0118 100644
--- a/docs/user_guides/fs/feature_view/feature_monitoring.md
+++ b/docs/user_guides/fs/feature_view/feature_monitoring.md
@@ -40,7 +40,7 @@ Connect the client running your notebooks to Hopsworks.
=== "Python"
- ```python3
+ ```python
import hopsworks
project = hopsworks.login()
@@ -61,7 +61,7 @@ The following is a code example for getting or creating a Feature View with name
=== "Python"
- ```python3
+ ```python
# Retrieve an existing feature view
trans_fv = fs.get_feature_view("trans_fv", version=1)
@@ -81,13 +81,13 @@ The following is a code example for creating a training dataset with two splits
=== "Python"
- ```python3
+ ```python
    # Create a training dataset with train, validation and test splits
_, _ = trans_fv.create_train_validation_test_split(
- description = 'transactions fraud batch training dataset',
- data_format = 'csv',
- validation_size = 0.2,
- test_size = 0.1,
+ description="transactions fraud batch training dataset",
+ data_format="csv",
+ validation_size=0.2,
+ test_size=0.1,
)
```
@@ -99,7 +99,7 @@ You can setup statistics monitoring on a ==single feature or multiple features==
=== "Python"
- ```python3
+ ```python
# compute statistics for all the features
fg_monitoring_config = trans_fv.create_statistics_monitoring(
name="trans_fv_all_features_monitoring",
@@ -121,7 +121,7 @@ You can create multiple feature monitoring configurations on the same Feature Vi
=== "Python"
- ```python3
+ ```python
fg_monitoring_config = trans_fv.create_feature_monitoring(
name="trans_fv_amount_monitoring",
feature_name="amount",
@@ -136,12 +136,12 @@ You can modify the default schedule by adjusting the `cron_expression`, `start_d
=== "Python"
- ```python3
+ ```python
fg_monitoring_config = trans_fv.create_statistics_monitoring(
name="trans_fv_all_features_monitoring",
description="Compute statistics on all data of all features of the Feature Group data on a weekly basis",
cron_expression="0 0 12 ? *MON*", # weekly
- row_percentage=0.8, # use 80% of the data
+ row_percentage=0.8, # use 80% of the data
)
# or
@@ -150,7 +150,7 @@ You can modify the default schedule by adjusting the `cron_expression`, `start_d
feature_name="amount",
description="Compute descriptive statistics on the amount Feature of the Feature Group data on a weekly basis",
cron_expression="0 0 12 ? * MON *", # weekly
- row_percentage=0.8, # use 80% of the data
+ row_percentage=0.8, # use 80% of the data
)
```
@@ -162,10 +162,10 @@ Additionally, you can specify the percentage of feature data on which statistics
=== "Python"
- ```python3
+ ```python
fm_monitoring_config.with_detection_window(
window_length="1w", # data ingested during one week
- time_offset="1w", # starting from last week
+ time_offset="1w", # starting from last week
row_percentage=0.8, # use 80% of the data
)
```
@@ -176,11 +176,11 @@ When setting up feature monitoring for a Feature View, reference windows can be
=== "Python"
- ```python3
+ ```python
# compare statistics against a reference window
fm_monitoring_config.with_reference_window(
window_length="1w", # data ingested during one week
- time_offset="2w", # starting from two weeks ago
+ time_offset="2w", # starting from two weeks ago
row_percentage=0.8, # use 80% of the data
)
@@ -191,7 +191,7 @@ When setting up feature monitoring for a Feature View, reference windows can be
# or a training dataset
fm_monitoring_config.with_reference_training_dataset(
- training_dataset_version=1, # use the training dataset used to train your production model
+ training_dataset_version=1, # use the training dataset used to train your production model
)
```
@@ -203,12 +203,12 @@ Then, you can define a relative or absolute threshold using the `threshold` and
=== "Python"
- ```python3
+ ```python
fm_monitoring_config.compare_on(
metric="mean",
threshold=0.2, # a relative change over 20% is considered anomalous
relative=True, # relative or absolute change
- strict=False, # strict or relaxed comparison
+ strict=False, # strict or relaxed comparison
)
```
@@ -222,7 +222,7 @@ Once the configuration is saved, the schedule for the statistics computation and
=== "Python"
- ```python3
+ ```python
fm_monitoring_config.save()
```
diff --git a/docs/user_guides/fs/feature_view/helper-columns.md b/docs/user_guides/fs/feature_view/helper-columns.md
index 8e7497cba..cbb22f305 100644
--- a/docs/user_guides/fs/feature_view/helper-columns.md
+++ b/docs/user_guides/fs/feature_view/helper-columns.md
@@ -27,18 +27,21 @@ for computing the [on-demand feature](../../../concepts/fs/feature_group/on_dema
```python
# define query object
- query = label_fg.select("fraud_label")\
- .join(trans_fg.select(["amount", "days_valid", "expiry_date", "category"]))
+ query = label_fg.select("fraud_label").join(
+ trans_fg.select(["amount", "days_valid", "expiry_date", "category"])
+ )
# define feature view with helper columns
feature_view = fs.get_or_create_feature_view(
- name='fv_with_helper_col',
+ name="fv_with_helper_col",
version=1,
query=query,
labels=["fraud_label"],
transformation_functions=transformation_functions,
inference_helper_columns=["expiry_date"],
)
+
+
```
### Inference Data Retrieval
@@ -52,24 +55,40 @@ However, they can be optionally fetched with inference or training data.
=== "Python"
```python
-
# import feature functions
from feature_functions import time_delta
# Fetch feature view object
feature_view = fs.get_feature_view(
- name='fv_with_helper_col',
+ name="fv_with_helper_col",
version=1,
)
# Fetch feature data for batch inference with helper columns
- df = feature_view.get_batch_data(start_time=start_time, end_time=end_time, inference_helpers=True, event_time=True)
+ df = feature_view.get_batch_data(
+ start_time=start_time,
+ end_time=end_time,
+ inference_helpers=True,
+ event_time=True,
+ )
    # compute the number of days the card remains valid
- df['days_valid'] = df.apply(lambda row: time_delta(row['expiry_date'], row['transaction_date']), axis=1)
+ df["days_valid"] = df.apply(
+ lambda row: time_delta(row["expiry_date"], row["transaction_date"]), axis=1
+ )
    # prepare dataframe for prediction
- df = df[[f.name for f in feature_view.features if not (f.label or f.inference_helper_column or f.training_helper_column)]]
+ df = df[
+ [
+ f.name
+ for f in feature_view.features
+ if not (
+ f.label or f.inference_helper_column or f.training_helper_column
+ )
+ ]
+ ]
+
+
```
#### Online inference
@@ -82,7 +101,7 @@ However, they can be optionally fetched with inference or training data.
# Fetch feature view object
feature_view = fs.get_feature_view(
- name='fv_with_helper_col',
+ name="fv_with_helper_col",
version=1,
)
@@ -97,16 +116,20 @@ However, they can be optionally fetched with inference or training data.
transaction_date = ...
    # get the expiry date of this credit card
- inference_helper = feature_view.get_inference_helper({"cc_num": cc_num}, return_type="dict")
+ inference_helper = feature_view.get_inference_helper(
+ {"cc_num": cc_num}, return_type="dict"
+ )
    # compute the number of days the card remains valid
- days_valid = time_delta(transaction_date, inference_helper['expiry_date'])
+ days_valid = time_delta(transaction_date, inference_helper["expiry_date"])
# Now get assembled feature vector for prediction
feature_vector = feature_view.get_feature_vector(
{"cc_num": cc_num},
passed_features={"days_valid": days_valid},
)
+
+
```
## Training Helper columns
@@ -119,18 +142,21 @@ For example one might want to use feature like `category` of the purchased produ
```python
# define query object
- query = label_fg.select("fraud_label")\
- .join(trans_fg.select(["amount", "days_valid", "expiry_date", "category"]))
+ query = label_fg.select("fraud_label").join(
+ trans_fg.select(["amount", "days_valid", "expiry_date", "category"])
+ )
# define feature view with helper columns
feature_view = fs.get_or_create_feature_view(
- name='fv_with_helper_col',
+ name="fv_with_helper_col",
version=1,
query=query,
labels=["fraud_label"],
transformation_functions=transformation_functions,
- training_helper_columns=["category"]
+ training_helper_columns=["category"],
)
+
+
```
### Training Data Retrieval
@@ -147,23 +173,24 @@ However, they can be optionally fetched.
# Fetch feature view object
feature_view = fs.get_feature_view(
- name='fv_with_helper_col',
+ name="fv_with_helper_col",
version=1,
)
    # Create training data with training helper columns
TEST_SIZE = 0.2
X_train, X_test, y_train, y_test = feature_view.train_test_split(
- description='transactions fraud training dataset',
+ description="transactions fraud training dataset",
test_size=TEST_SIZE,
- training_helper_columns=True
+ training_helper_columns=True,
)
# Get existing training data with training helper columns
X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
- training_dataset_version=1,
- training_helper_columns=True
+ training_dataset_version=1, training_helper_columns=True
)
+
+
```
!!! note
diff --git a/docs/user_guides/fs/feature_view/model-dependent-transformations.md b/docs/user_guides/fs/feature_view/model-dependent-transformations.md
index 8e911dd47..66ebfe518 100644
--- a/docs/user_guides/fs/feature_view/model-dependent-transformations.md
+++ b/docs/user_guides/fs/feature_view/model-dependent-transformations.md
@@ -32,23 +32,30 @@ Additionally, Hopsworks also allows users to specify custom names for transforme
# Defining a many to many transformation function.
@udf(return_type=[int, int, int], drop=["feature1", "feature3"])
def add_one_multiple(feature1, feature2, feature3):
- return pd.DataFrame({"add_one_feature1":feature1 + 1, "add_one_feature2":feature2 + 1, "add_one_feature3":feature3 + 1})
+ return pd.DataFrame(
+ {
+ "add_one_feature1": feature1 + 1,
+ "add_one_feature2": feature2 + 1,
+ "add_one_feature3": feature3 + 1,
+ }
+ )
+
# Defining a one to one transformation function.
@udf(return_type=int)
def add_two(feature):
return feature + 2
+
# Creating model-dependent transformations by attaching transformation functions to feature views.
feature_view = fs.create_feature_view(
- name='transactions_view',
+ name="transactions_view",
query=query,
labels=["fraud_label"],
- transformation_functions=[
- add_two,
- add_one_multiple
- ]
+ transformation_functions=[add_two, add_one_multiple],
)
+
+
```
### Specifying input features
@@ -60,15 +67,17 @@ The features to be used by a model-dependent transformation function can be spec
```python
feature_view = fs.create_feature_view(
- name='transactions_view',
+ name="transactions_view",
query=query,
labels=["fraud_label"],
transformation_functions=[
add_two("feature_1"),
add_two("feature_2"),
- add_one_multiple("feature_5", "feature_6", "feature_7")
- ]
+ add_one_multiple("feature_5", "feature_6", "feature_7"),
+ ],
)
+
+
```
### Using built-in transformations
@@ -86,16 +95,18 @@ The only difference is that they can either be retrieved from the Hopsworks or i
label_encoder = fs.get_transformation_function(name="label_encoder")
feature_view = fs.create_feature_view(
- name='transactions_view',
+ name="transactions_view",
query=query,
labels=["fraud_label"],
- transformation_functions = [
+ transformation_functions=[
label_encoder("category"),
robust_scaler("amount"),
min_max_scaler("loc_delta"),
- standard_scaler("age_at_transaction")
- ]
+ standard_scaler("age_at_transaction"),
+ ],
)
+
+
```
Built-in transformation functions can also be attached by importing them directly in code from `hopsworks.hsfs.builtin_transformations`, as shown below.
@@ -104,19 +115,26 @@ To attach built-in transformation functions from the `hopsworks` module they can
=== "Python"
```python
- from hopsworks.hsfs.builtin_transformations import min_max_scaler, label_encoder, robust_scaler, standard_scaler
+ from hopsworks.hsfs.builtin_transformations import (
+ label_encoder,
+ min_max_scaler,
+ robust_scaler,
+ standard_scaler,
+ )
feature_view = fs.create_feature_view(
- name='transactions_view',
+ name="transactions_view",
query=query,
labels=["fraud_label"],
- transformation_functions = [
+ transformation_functions=[
label_encoder("category"),
robust_scaler("amount"),
min_max_scaler("loc_delta"),
- standard_scaler("age_at_transaction")
- ]
+ standard_scaler("age_at_transaction"),
+ ],
)
+
+
```
## Using Model Dependent Transformations
@@ -135,10 +153,14 @@ Model-dependent transformation functions can also be manually applied to a featu
fv.init_serving(training_dataset_version)
# Get untransformed feature Vector
- feature_vector = fv.get_feature_vector(entry={"index":10}, transform=False, return_type="pandas")
+ feature_vector = fv.get_feature_vector(
+ entry={"index": 10}, transform=False, return_type="pandas"
+ )
# Apply Model Dependent transformations
encoded_feature_vector = fv.transform(feature_vector)
+
+
```
### Retrieving untransformed feature vector and batch inference data
@@ -161,7 +183,7 @@ To achieve this, set the `transform` parameter to False.
)
# Fetching untransformed batch data.
- untransformed_batch_data = feature_view.get_batch_data(
- transform=False
- )
+ untransformed_batch_data = feature_view.get_batch_data(transform=False)
+
+
```
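As a possible follow-up sketch, assuming `transform()` accepts the batch DataFrame the same way it accepts the single untransformed feature vector above:

```python
# Sketch: manually apply the model-dependent transformations to the
# untransformed batch data, mirroring feature_view.transform() used above.
encoded_batch_data = feature_view.transform(untransformed_batch_data)
```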
diff --git a/docs/user_guides/fs/feature_view/overview.md b/docs/user_guides/fs/feature_view/overview.md
index 0521b587f..9973f3ad9 100644
--- a/docs/user_guides/fs/feature_view/overview.md
+++ b/docs/user_guides/fs/feature_view/overview.md
@@ -20,19 +20,18 @@ For example, when a client reads a numerical feature, the feature value could be
```python
# create a simple feature view
- feature_view = fs.create_feature_view(
- name='transactions_view',
- query=query
- )
+ feature_view = fs.create_feature_view(name="transactions_view", query=query)
# create a feature view with transformation and label
feature_view = fs.create_feature_view(
- name='transactions_view',
+ name="transactions_view",
query=query,
labels=["fraud_label"],
transformation_functions={
- "amount": fs.get_transformation_function(name="standard_scaler", version=1)
- }
+ "amount": fs.get_transformation_function(
+ name="standard_scaler", version=1
+ )
+ },
)
```
@@ -104,7 +103,7 @@ You can learn more in [Tags Guide](../tags/tags.md).
# get
feature_view.get_tag(name="tag_schema")
- #remove
+ # remove
feature_view.delete_tag(name="tag_schema")
```
diff --git a/docs/user_guides/fs/feature_view/query.md b/docs/user_guides/fs/feature_view/query.md
index 6a2955819..8ea028925 100644
--- a/docs/user_guides/fs/feature_view/query.md
+++ b/docs/user_guides/fs/feature_view/query.md
@@ -77,7 +77,9 @@ Selecting features from a feature group is a lazy operation, returning a query w
credit_card_transactions_fg = fs.get_feature_group("credit_card_transactions")
# Returns Query
- selected_features = credit_card_transactions_fg.select(["amount", "latitude", "longitude"])
+ selected_features = credit_card_transactions_fg.select(
+ ["amount", "latitude", "longitude"]
+ )
```
=== "Scala"
@@ -118,9 +120,16 @@ The join key lists should contain the names of the features to join on.
=== "Python"
```python
- selected_features = credit_card_transactions_fg.select_all() \
- .join(account_details_fg.select_all(), on=["cc_num"]) \
- .join(merchant_details_fg.select_all(), left_on=["merchant_id"], right_on=["id"], join_type="inner")
+ selected_features = (
+ credit_card_transactions_fg.select_all()
+ .join(account_details_fg.select_all(), on=["cc_num"])
+ .join(
+ merchant_details_fg.select_all(),
+ left_on=["merchant_id"],
+ right_on=["id"],
+ join_type="inner",
+ )
+ )
```
=== "Scala"
@@ -222,7 +231,9 @@ For the Scala part of the API, equivalent methods are available in the `Feature`
=== "Python"
```python
- filtered_credit_card_transactions = credit_card_transactions_fg.filter(credit_card_transactions_fg.category == "Grocery")
+ filtered_credit_card_transactions = credit_card_transactions_fg.filter(
+ credit_card_transactions_fg.category == "Grocery"
+ )
```
=== "Scala"
@@ -236,10 +247,19 @@ Filters are fully compatible with joins:
=== "Python"
```python
- selected_features = credit_card_transactions_fg.select_all() \
- .join(account_details_fg.select_all(), on=["cc_num"]) \
- .join(merchant_details_fg.select_all(), left_on=["merchant_id"], right_on=["id"]) \
- .filter((credit_card_transactions_fg.category == "Grocery") | (credit_card_transactions_fg.category == "Restaurant/Cafeteria"))
+ selected_features = (
+ credit_card_transactions_fg.select_all()
+ .join(account_details_fg.select_all(), on=["cc_num"])
+ .join(
+ merchant_details_fg.select_all(),
+ left_on=["merchant_id"],
+ right_on=["id"],
+ )
+ .filter(
+ (credit_card_transactions_fg.category == "Grocery")
+ | (credit_card_transactions_fg.category == "Restaurant/Cafeteria")
+ )
+ )
```
=== "Scala"
@@ -256,10 +276,21 @@ The filters can be applied at any point of the query:
=== "Python"
```python
- selected_features = credit_card_transactions_fg.select_all() \
- .join(accountDetails_fg.select_all().filter(accountDetails_fg.avg_temp >= 22), on=["cc_num"]) \
- .join(merchant_details_fg.select_all(), left_on=["merchant_id"], right_on=["id"]) \
+ selected_features = (
+ credit_card_transactions_fg.select_all()
+ .join(
+ accountDetails_fg.select_all().filter(
+ accountDetails_fg.avg_temp >= 22
+ ),
+ on=["cc_num"],
+ )
+ .join(
+ merchant_details_fg.select_all(),
+ left_on=["merchant_id"],
+ right_on=["id"],
+ )
.filter(credit_card_transactions_fg.category == "Grocery")
+ )
```
=== "Scala"
diff --git a/docs/user_guides/fs/feature_view/spine-query.md b/docs/user_guides/fs/feature_view/spine-query.md
index f156dd77f..d2273b539 100644
--- a/docs/user_guides/fs/feature_view/spine-query.md
+++ b/docs/user_guides/fs/feature_view/spine-query.md
@@ -21,8 +21,9 @@ The first step before creating a Feature View, is to construct the query by sele
```python
# Select features for training data.
-ds_query = trans_fg.select(["fraud_label"])\
- .join(window_aggs_fg.select_except(["cc_num"]), on="cc_num")
+ds_query = trans_fg.select(["fraud_label"]).join(
+ window_aggs_fg.select_except(["cc_num"]), on="cc_num"
+)
ds_query.show(5)
```
@@ -39,14 +40,15 @@ trans_spine = fs.get_or_create_spine_group(
name="spine_transactions",
version=1,
description="Transaction data",
- primary_key=['cc_num'],
- event_time='datetime',
- dataframe=trans_df
+ primary_key=["cc_num"],
+ event_time="datetime",
+ dataframe=trans_df,
)
# Select features for training data.
-ds_query_spine = trans_spine.select(["fraud_label"])\
- .join(window_aggs_fg.select_except(["cc_num"]), on="cc_num")
+ds_query_spine = trans_spine.select(["fraud_label"]).join(
+ window_aggs_fg.select_except(["cc_num"]), on="cc_num"
+)
```
Calling the `show()` or `read()` method of this query object will use the spine dataframe included in the Spine Group object to perform the join.
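For example, a one-line sketch reusing the query defined above:

```python
# Preview the join evaluated against the spine dataframe
ds_query_spine.show(5)
```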
@@ -61,7 +63,7 @@ With the above defined query, we can continue to create the Feature View in the
```python
feature_view_spine = fs.get_or_create_feature_view(
- name='transactions_view_spine',
+ name="transactions_view_spine",
query=ds_query_spine,
version=1,
labels=["fraud_label"],
@@ -74,7 +76,9 @@ With the regular feature view, the labels are fetched from the feature store, bu
Here you have the chance to pass a different set of entities to generate the training dataset.
```python
-X_train, X_test, y_train, y_test = feature_view_spine.train_test_split(0.2, spine=new_entities_df)
+X_train, X_test, y_train, y_test = feature_view_spine.train_test_split(
+ 0.2, spine=new_entities_df
+)
X_train.show()
```
diff --git a/docs/user_guides/fs/feature_view/training-data.md b/docs/user_guides/fs/feature_view/training-data.md
index e8a0d46ad..d57d45260 100644
--- a/docs/user_guides/fs/feature_view/training-data.md
+++ b/docs/user_guides/fs/feature_view/training-data.md
@@ -19,16 +19,16 @@ You can monitor the job status in the [jobs overview UI](../../projects/jobs/pys
```python
# create a training dataset as dataframe
feature_df, label_df = feature_view.training_data(
- description = 'transactions fraud batch training dataset',
+ description="transactions fraud batch training dataset",
)
# materialise a training dataset
version, job = feature_view.create_training_data(
- description = 'transactions fraud batch training dataset',
- data_format = 'csv',
- write_options = {"wait_for_job": False}
-) # By default, it is materialised to HopsFS
-print(job.id) # get the job's id and view the job status in the UI
+ description="transactions fraud batch training dataset",
+ data_format="csv",
+ write_options={"wait_for_job": False},
+) # By default, it is materialised to HopsFS
+print(job.id) # get the job's id and view the job status in the UI
```
### Extra filters
@@ -44,13 +44,14 @@ Examples below show how to create training data for different transaction catego
```python
# Create a training dataset for Health/Beauty
df_health = feature_view.training_data(
- description = 'transactions fraud batch training dataset for Health/Beauty',
- extra_filter = trans_fg.category == "Health/Beauty"
+ description="transactions fraud batch training dataset for Health/Beauty",
+ extra_filter=trans_fg.category == "Health/Beauty",
)
# Create a training dataset for Restaurant/Cafeteria and Holliday/Travel
df_restaurant_travel = feature_view.training_data(
- description = 'transactions fraud batch training dataset for Restaurant/Cafeteria and Holliday/Travel',
- extra_filter = trans_fg.category == "Restaurant/Cafeteria" and trans_fg.category == "Holliday/Travel"
+ description="transactions fraud batch training dataset for Restaurant/Cafeteria and Holliday/Travel",
+    extra_filter=(trans_fg.category == "Restaurant/Cafeteria")
+    | (trans_fg.category == "Holliday/Travel"),
)
```
@@ -67,9 +68,9 @@ X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)
# materialise a training dataset
version, job = feature_view.create_train_test_split(
- test_size = 0.2,
- description = 'transactions fraud batch training dataset',
- data_format = 'csv'
+ test_size=0.2,
+ description="transactions fraud batch training dataset",
+ data_format="csv",
)
```
@@ -77,14 +78,18 @@ Create a training dataset (as in-memory DataFrames) or materialise a training da
```python
# create a training dataset as DataFrame
-X_train, X_val, X_test, y_train, y_val, y_test = feature_view.train_validation_test_split(validation_size=0.3, test_size=0.2)
+X_train, X_val, X_test, y_train, y_val, y_test = (
+ feature_view.train_validation_test_split(
+ validation_size=0.3, test_size=0.2
+ )
+)
# materialise a training dataset
version, job = feature_view.create_train_validation_test_split(
- validation_size = 0.3,
- test_size = 0.2,
- description = 'transactions fraud batch training dataset',
- data_format = 'csv'
+ validation_size=0.3,
+ test_size=0.2,
+ description="transactions fraud batch training dataset",
+ data_format="csv",
)
```
@@ -93,7 +98,9 @@ and you want to create a particular in-memory training dataset with Hive instead
```python
# create a training dataset as DataFrame with Hive
-X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2, read_options={"use_hive": True})
+X_train, X_test, y_train, y_test = feature_view.train_test_split(
+ test_size=0.2, read_options={"use_hive": True}
+)
```
## Read Training Data
@@ -105,13 +112,19 @@ That is, you can delete the training data files (for example, to reduce storage
```python
# get a training dataset
-feature_df, label_df = feature_view.get_training_data(training_dataset_version=1)
+feature_df, label_df = feature_view.get_training_data(
+ training_dataset_version=1
+)
# get a training dataset with train and test splits
-X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1)
+X_train, X_test, y_train, y_test = feature_view.get_train_test_split(
+ training_dataset_version=1
+)
# get a training dataset with train, validation and test splits
-X_train, X_val, X_test, y_train, y_val, y_test = feature_view.get_train_validation_test_split(training_dataset_version=1)
+X_train, X_val, X_test, y_train, y_val, y_test = (
+ feature_view.get_train_validation_test_split(training_dataset_version=1)
+)
```
## Passing Context Variables to Transformation Functions
@@ -130,7 +143,7 @@ Once you have [defined a transformation function using a context variable](../tr
training_dataset_version=1,
primary_key=True,
event_time=True,
- transformation_context={"context_parameter":10},
+ transformation_context={"context_parameter": 10},
)
# Passing context variable to Materialized Training Dataset.
@@ -138,8 +151,10 @@ Once you have [defined a transformation function using a context variable](../tr
training_dataset_version=1,
primary_key=True,
event_time=True,
- transformation_context={"context_parameter":10},
+ transformation_context={"context_parameter": 10},
)
+
+
```
## Read training data with primary key(s) and event time
@@ -192,7 +207,7 @@ feature_view.purge_all_training_data()
To recreate a training dataset:
```python
-feature_view.recreate_training_dataset(training_dataset_version =1)
+feature_view.recreate_training_dataset(training_dataset_version=1)
```
## Tags
@@ -203,16 +218,18 @@ You can learn more in [Tags Guide](../tags/tags.md).
```python
# attach
feature_view.add_training_dataset_tag(
- training_dataset_version=1,
- name="tag_schema",
- value={"key": "value"}
+ training_dataset_version=1, name="tag_schema", value={"key": "value"}
)
# get
-feature_view.get_training_dataset_tag(training_dataset_version=1, name="tag_schema")
+feature_view.get_training_dataset_tag(
+ training_dataset_version=1, name="tag_schema"
+)
-#remove
-feature_view.delete_training_dataset_tag(training_dataset_version=1, name="tag_schema")
+# remove
+feature_view.delete_training_dataset_tag(
+ training_dataset_version=1, name="tag_schema"
+)
```
## Next
diff --git a/docs/user_guides/fs/provenance/provenance.md b/docs/user_guides/fs/provenance/provenance.md
index 03626c8e0..5584db369 100644
--- a/docs/user_guides/fs/provenance/provenance.md
+++ b/docs/user_guides/fs/provenance/provenance.md
@@ -40,9 +40,7 @@ You can inspect the relationship between data sources and feature groups using t
# Create the user profiles feature group
user_profiles_fg = fs.create_external_feature_group(
- name="user_profiles",
- version=1,
- data_source=ds
+ name="user_profiles", version=1, data_source=ds
)
user_profiles_fg.save()
```
@@ -114,7 +112,9 @@ You can mark the external feature group as parent of the feature group you are c
# Do feature engineering
age_df = transaction_df.merge(profiles_fg.read(), on="cc_num", how="left")
- transaction_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y")
+ transaction_df["age_at_transaction"] = (
+ age_df["datetime"] - age_df["birthdate"]
+ ) / np.timedelta64(1, "Y")
# Create the transaction feature group
transaction_fg = fs.get_or_create_feature_group(
@@ -123,7 +123,7 @@ You can mark the external feature group as parent of the feature group you are c
description="Transaction features",
primary_key=["cc_num"],
event_time="datetime",
- parents=[profiles_fg]
+ parents=[profiles_fg],
)
transaction_fg.insert(transaction_df)
```
@@ -138,10 +138,12 @@ Another example use case for derived feature group is if you have a feature grou
daily_transaction_df = daily_transaction_fg.read()
# Do feature engineering
- cc_group = daily_transaction_df[["cc_num", "amount", "datetime"]] \
- .groupby("cc_num") \
- .rolling("1M", on="datetime")
- monthly_transaction_df = pd.DataFrame(cc_group.mean())
+ cc_group = (
+ daily_transaction_df[["cc_num", "amount", "datetime"]]
+ .groupby("cc_num")
+ .rolling("1M", on="datetime")
+ )
+ monthly_transaction_df = pd.DataFrame(cc_group.mean())
# Create the transaction feature group
monthly_transaction_fg = fs.get_or_create_feature_group(
@@ -150,7 +152,7 @@ Another example use case for derived feature group is if you have a feature grou
description="Transaction features - monthly aggregates",
primary_key=["cc_num"],
event_time="datetime",
- parents=[daily_transaction_fg]
+ parents=[daily_transaction_fg],
)
monthly_transaction_fg.insert(monthly_transaction_df)
```
diff --git a/docs/user_guides/fs/sharing/sharing.md b/docs/user_guides/fs/sharing/sharing.md
index 23cc6a1f3..1751fcbbf 100644
--- a/docs/user_guides/fs/sharing/sharing.md
+++ b/docs/user_guides/fs/sharing/sharing.md
@@ -145,13 +145,11 @@ shared_feature_store = project.get_feature_store(name="name_of_shared_feature_st
```python
# Fetch a feature group from the shared feature store
shared_fg = shared_feature_store.get_feature_group(
- name="shared_fg_name",
- version=1
+ name="shared_fg_name", version=1
)
# Fetch a feature group from your project's feature store
fg = project_feature_store.get_or_create_feature_group(
- name="feature_group_name",
- version=1
+ name="feature_group_name", version=1
)
```
diff --git a/docs/user_guides/fs/tags/tags.md b/docs/user_guides/fs/tags/tags.md
index 10c43e763..f51ccceee 100644
--- a/docs/user_guides/fs/tags/tags.md
+++ b/docs/user_guides/fs/tags/tags.md
@@ -87,9 +87,9 @@ You can attach tags to feature groups and feature views by using the `add_tag()`
# Define the tag
tag = {
- 'business_unit': 'Fraud',
- 'data_owner': 'email@hopsworks.ai',
- 'pii': True
+ "business_unit": "Fraud",
+ "data_owner": "email@hopsworks.ai",
+ "pii": True,
}
# Attach the tag
diff --git a/docs/user_guides/fs/transformation_functions.md b/docs/user_guides/fs/transformation_functions.md
index cefc6e020..413fc09dc 100644
--- a/docs/user_guides/fs/transformation_functions.md
+++ b/docs/user_guides/fs/transformation_functions.md
@@ -79,6 +79,7 @@ The transformation function should take one argument as input and return a Panda
```python
from hopsworks import udf
+
@udf(return_type=int)
def add_one(feature):
return feature + 1
@@ -94,6 +95,7 @@ The creation of many-to-one transformation functions is similar to that of a one
```python
from hopsworks import udf
+
@udf(return_type=int)
def add_features(feature1, feature2, feature3):
return feature1 + feature2 + feature3
@@ -109,7 +111,7 @@ The return types provided to the decorator must match the types of each column i
```python
from hopsworks import udf
- import pandas as pd
+
@udf(return_type=[int, int])
def add_one_and_two(feature1):
@@ -125,7 +127,7 @@ The creation of a many-to-many transformation function is similar to that of a o
```python
from hopsworks import udf
- import pandas as pd
+
@udf(return_type=[int, int, int])
def add_one_multiple(feature1, feature2, feature3):
@@ -148,13 +150,14 @@ In this mode, the transformation function is executed as a Pandas UDF during tra
```python
from hopsworks import udf
- import pandas as pd
+
# "default" mode is used if the parameter `mode` is not explicitly set.
@udf(return_type=[int, int, int])
def add_one_multiple(feature1, feature2, feature3):
return feature1 + 1, feature2 + 1, feature3 + 1
+
@udf(return_type=[int, int, int], mode="default")
def add_two_multiple(feature1, feature2, feature3):
return feature1 + 2, feature2 + 2, feature3 + 2
@@ -169,9 +172,9 @@ The transformation function can be configured to always execute as a Python UDF
```python
from hopsworks import udf
- import pandas as pd
- @udf(return_type=[int, int, int], mode = "python")
+
+ @udf(return_type=[int, int, int], mode="python")
def add_one_multiple(feature1, feature2, feature3):
return feature1 + 1, feature2 + 1, feature3 + 1
```
@@ -184,13 +187,21 @@ The transformation function can be configured to always execute as a Pandas UDF
=== "Python"
```python
- from hopsworks import udf
import pandas as pd
+ from hopsworks import udf
+
# A Pandas UDF returning a Pandas DataFrame
- @udf(return_type=[int, int, int], mode = "pandas")
+ @udf(return_type=[int, int, int], mode="pandas")
def add_one_multiple(feature1, feature2, feature3):
- return pd.DataFrame({"add_one_feature1":feature1 + 1, "add_one_feature2":feature2 + 1, "add_one_feature3":feature3 + 1})
+ return pd.DataFrame(
+ {
+ "add_one_feature1": feature1 + 1,
+ "add_one_feature2": feature2 + 1,
+ "add_one_feature3": feature3 + 1,
+ }
+ )
+
# A Pandas UDF returning multiple Pandas Series
@udf(return_type=[int, int, int], mode="pandas")
@@ -208,7 +219,7 @@ In the example below, the columns mapped to the arguments `feature1` and `fea
```python
from hopsworks import udf
- import pandas as pd
+
@udf(return_type=[int, int, int], drop=["feature1", "feature3"])
def add_one_multiple(feature1, feature2, feature3):
@@ -226,14 +237,17 @@ If no name is provided via the `alias` function, Hopsworks generates default out
```python
from hopsworks import udf
- import pandas as pd
+
@udf(return_type=[int, int, int], drop=["feature1", "feature3"])
def add_one_multiple(feature1, feature2, feature3):
return feature1 + 1, feature2 + 1, feature3 + 1
+
# Specifying output feature names of the transformation function.
- add_one_multiple.alias("transformed_feature1", "transformed_feature2", "transformed_feature3")
+ add_one_multiple.alias(
+ "transformed_feature1", "transformed_feature2", "transformed_feature3"
+ )
```
### Training dataset statistics
@@ -260,9 +274,17 @@ Upon instantiation, instances of `FeatureTransformationStatistics` contain `N
stats = TransformationStatistics("argument1", "argument2", "argument3")
+
@udf(int)
def add_features(argument1, argument2, argument3, statistics=stats):
- return argument1 + argument2 + argument3 + statistics.argument1.mean + statistics.argument2.mean + statistics.argument3.mean
+ return (
+ argument1
+ + argument2
+ + argument3
+ + statistics.argument1.mean
+ + statistics.argument2.mean
+ + statistics.argument3.mean
+ )
```
### Passing context variables to transformation function
@@ -277,6 +299,7 @@ By including the context argument, you can pass the necessary data as a dictiona
```python
from hopsworks import udf
+
@udf(int)
def add_features(argument1, context):
return argument1 + context["value_to_add"]
@@ -292,8 +315,8 @@ The save function will throw an error if another transformation function with th
```python
plus_one_meta = fs.create_transformation_function(
- transformation_function=add_one,
- version=1)
+ transformation_function=add_one, version=1
+ )
plus_one_meta.save()
```
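Once saved, the function can be fetched back and attached to a feature view like any other transformation function; a minimal sketch reusing `get_transformation_function` and the `transformation_functions` argument shown earlier in these guides (`query` stands in for a previously selected query):

```python
# Sketch: retrieve the saved UDF later and attach it to a feature view
# ("query" is assumed to be a query object selected beforehand).
plus_one = fs.get_transformation_function(name="add_one", version=1)

feature_view = fs.create_feature_view(
    name="transactions_view",
    query=query,
    transformation_functions=[plus_one("amount")],
)
```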
diff --git a/docs/user_guides/integrations/databricks/api_key.md b/docs/user_guides/integrations/databricks/api_key.md
index 659b14c0a..810d50795 100644
--- a/docs/user_guides/integrations/databricks/api_key.md
+++ b/docs/user_guides/integrations/databricks/api_key.md
@@ -19,13 +19,14 @@ For the Databricks integration to work make sure you add the following scopes to
```python hl_lines="6"
import hopsworks
+
project = hopsworks.login(
- host='my_instance', # DNS of your Feature Store instance
- port=443, # Port to reach your Hopsworks instance, defaults to 443
- project='my_project', # Name of your Hopsworks Feature Store project
- api_key_value='apikey', # The API key to authenticate with Hopsworks
+ host="my_instance", # DNS of your Feature Store instance
+ port=443, # Port to reach your Hopsworks instance, defaults to 443
+ project="my_project", # Name of your Hopsworks Feature Store project
+ api_key_value="apikey", # The API key to authenticate with Hopsworks
)
- fs = project.get_feature_store() # Get the project's default feature store
+ fs = project.get_feature_store() # Get the project's default feature store
```
## Next Steps
diff --git a/docs/user_guides/integrations/databricks/configuration.md b/docs/user_guides/integrations/databricks/configuration.md
index 6e8d8b113..150de3617 100644
--- a/docs/user_guides/integrations/databricks/configuration.md
+++ b/docs/user_guides/integrations/databricks/configuration.md
@@ -101,13 +101,14 @@ Once the cluster is running users can establish a connection to the Hopsworks Fe
```python
import hopsworks
+
project = hopsworks.login(
- host='my_instance', # DNS of your Hopsworks instance
- port=443, # Port to reach your Hopsworks instance, defaults to 443
- project='my_project', # Name of your Hopsworks project
- api_key_value='apikey', # The API key to authenticate with Hopsworks
+ host="my_instance", # DNS of your Hopsworks instance
+ port=443, # Port to reach your Hopsworks instance, defaults to 443
+ project="my_project", # Name of your Hopsworks project
+ api_key_value="apikey", # The API key to authenticate with Hopsworks
)
-fs = project.get_feature_store() # Get the project's default feature store
+fs = project.get_feature_store() # Get the project's default feature store
```
## Next Steps
diff --git a/docs/user_guides/integrations/emr/emr_configuration.md b/docs/user_guides/integrations/emr/emr_configuration.md
index da25a9494..dc4d16bdf 100644
--- a/docs/user_guides/integrations/emr/emr_configuration.md
+++ b/docs/user_guides/integrations/emr/emr_configuration.md
@@ -177,7 +177,6 @@ echo -n $(curl -H "Authorization: ApiKey ${API_KEY}" https://$HOST/hopsworks-api
chmod -R o-rwx /usr/lib/hopsworks
sudo pip3 install --upgrade hopsworks~=X.X.0
-
```
!!! attention "Matching Hopsworks version"
diff --git a/docs/user_guides/integrations/hdinsight.md b/docs/user_guides/integrations/hdinsight.md
index 8295b9329..e0abb1f97 100644
--- a/docs/user_guides/integrations/hdinsight.md
+++ b/docs/user_guides/integrations/hdinsight.md
@@ -143,15 +143,15 @@ import hopsworks
# Put the API key into Key Vault for any production setup:
# See, https://azure.microsoft.com/en-us/services/key-vault/
-secret_value = 'MY_API_KEY'
+secret_value = "MY_API_KEY"
# Create a connection
project = hopsworks.login(
- host='MY_INSTANCE.cloud.hopsworks.ai', # DNS of your Feature Store instance
- port=443, # Port to reach your Hopsworks instance, defaults to 443
- project='MY_PROJECT', # Name of your Hopsworks project
- api_key_value=secret_value, # The API key to authenticate with Hopsworks
- hostname_verification=True # Disable for self-signed certificates
+ host="MY_INSTANCE.cloud.hopsworks.ai", # DNS of your Feature Store instance
+ port=443, # Port to reach your Hopsworks instance, defaults to 443
+ project="MY_PROJECT", # Name of your Hopsworks project
+ api_key_value=secret_value, # The API key to authenticate with Hopsworks
+ hostname_verification=True, # Disable for self-signed certificates
)
# Get the feature store handle for the project's feature store
diff --git a/docs/user_guides/integrations/mlstudio_designer.md b/docs/user_guides/integrations/mlstudio_designer.md
index 5acd2c7a3..de8bb8942 100644
--- a/docs/user_guides/integrations/mlstudio_designer.md
+++ b/docs/user_guides/integrations/mlstudio_designer.md
@@ -53,38 +53,40 @@ In the pipeline, add a new `Execute Python Script` step and replace the Python s
```python
-import os
import importlib.util
+import os
-
-package_name = 'hopsworks'
-version = 'MY_VERSION'
+package_name = "hopsworks"
+version = "MY_VERSION"
spec = importlib.util.find_spec(package_name)
if spec is None:
import os
+
os.system(f"pip install %s[python]==%s" % (package_name, version))
# Put the API key into Key Vault for any production setup:
# See, https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-secrets-in-runs
-#from azureml.core import Experiment, Run
-#run = Run.get_context()
-#secret_value = run.get_secret(name="fs-api-key")
-secret_value = 'MY_API_KEY'
+# from azureml.core import Experiment, Run
+# run = Run.get_context()
+# secret_value = run.get_secret(name="fs-api-key")
+secret_value = "MY_API_KEY"
+
-def azureml_main(dataframe1 = None, dataframe2 = None):
+def azureml_main(dataframe1=None, dataframe2=None):
import hopsworks
+
project = hopsworks.login(
- host='MY_INSTANCE.cloud.hopsworks.ai', # DNS of your Hopsworks instance
- port=443, # Port to reach your Hopsworks instance, defaults to 443
- project='MY_PROJECT', # Name of your Hopsworks project
- api_key_value=secret_value, # The API key to authenticate with Hopsworks
- hostname_verification=True, # Disable for self-signed certificates
- engine='python' # Choose python as engine
+ host="MY_INSTANCE.cloud.hopsworks.ai", # DNS of your Hopsworks instance
+ port=443, # Port to reach your Hopsworks instance, defaults to 443
+ project="MY_PROJECT", # Name of your Hopsworks project
+ api_key_value=secret_value, # The API key to authenticate with Hopsworks
+ hostname_verification=True, # Disable for self-signed certificates
+ engine="python", # Choose python as engine
)
- fs = project.get_feature_store() # Get the project's default feature store
+ fs = project.get_feature_store() # Get the project's default feature store
- return fs.get_feature_group('MY_FEATURE_GROUP', version=1).read(),
+ return (fs.get_feature_group("MY_FEATURE_GROUP", version=1).read(),)
```
Select a compute target and save the step.
diff --git a/docs/user_guides/integrations/mlstudio_notebooks.md b/docs/user_guides/integrations/mlstudio_notebooks.md
index 182b1784f..ac439bdd7 100644
--- a/docs/user_guides/integrations/mlstudio_notebooks.md
+++ b/docs/user_guides/integrations/mlstudio_notebooks.md
@@ -64,19 +64,19 @@ import hopsworks
# Put the API key into Key Vault for any production setup:
# See: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-secrets-in-runs
-#from azureml.core import Experiment, Run
-#run = Run.get_context()
-#secret_value = run.get_secret(name="fs-api-key")
-secret_value = 'MY_API_KEY'
+# from azureml.core import Experiment, Run
+# run = Run.get_context()
+# secret_value = run.get_secret(name="fs-api-key")
+secret_value = "MY_API_KEY"
# Create a connection
project = hopsworks.login(
- host='MY_INSTANCE.cloud.hopsworks.ai', # DNS of your Hopsworks instance
- port=443, # Port to reach your Hopsworks instance, defaults to 443
- project='MY_PROJECT', # Name of your Hopsworks project
- api_key_value=secret_value, # The API key to authenticate with Hopsworks
- hostname_verification=True, # Disable for self-signed certificates
- engine='python' # Choose Python as engine
+ host="MY_INSTANCE.cloud.hopsworks.ai", # DNS of your Hopsworks instance
+ port=443, # Port to reach your Hopsworks instance, defaults to 443
+ project="MY_PROJECT", # Name of your Hopsworks project
+ api_key_value=secret_value, # The API key to authenticate with Hopsworks
+ hostname_verification=True, # Disable for self-signed certificates
+ engine="python", # Choose Python as engine
)
# Get the feature store handle for the project's feature store
diff --git a/docs/user_guides/integrations/python.md b/docs/user_guides/integrations/python.md
index 5e1bc2e66..2ab241f06 100644
--- a/docs/user_guides/integrations/python.md
+++ b/docs/user_guides/integrations/python.md
@@ -47,14 +47,15 @@ You are now ready to connect to Hopsworks from your Python environment:
```python
import hopsworks
+
project = hopsworks.login(
- host='my_instance', # DNS of your Hopsworks instance
- port=443, # Port to reach your Hopsworks instance, defaults to 443
- project='my_project', # Name of your Hopsworks project
- api_key_value='apikey', # The API key to authenticate with Hopsworks
- engine='python', # Use the Python engine
+ host="my_instance", # DNS of your Hopsworks instance
+ port=443, # Port to reach your Hopsworks instance, defaults to 443
+ project="my_project", # Name of your Hopsworks project
+ api_key_value="apikey", # The API key to authenticate with Hopsworks
+ engine="python", # Use the Python engine
)
-fs = project.get_feature_store() # Get the project's default feature store
+fs = project.get_feature_store() # Get the project's default feature store
```
!!! note "Engine"
diff --git a/docs/user_guides/integrations/spark.md b/docs/user_guides/integrations/spark.md
index 9dc774738..18722ac0d 100644
--- a/docs/user_guides/integrations/spark.md
+++ b/docs/user_guides/integrations/spark.md
@@ -83,14 +83,15 @@ You are now ready to connect to the Hopsworks Feature Store from Spark:
```python
import hopsworks
+
project = hopsworks.login(
- host='my_instance', # DNS of your Feature Store instance
- port=443, # Port to reach your Hopsworks instance, defaults to 443
- project='my_project', # Name of your Hopsworks Feature Store project
- api_key_value='api_key', # The API key to authenticate with the feature store
- hostname_verification=True # Disable for self-signed certificates
+ host="my_instance", # DNS of your Feature Store instance
+ port=443, # Port to reach your Hopsworks instance, defaults to 443
+ project="my_project", # Name of your Hopsworks Feature Store project
+ api_key_value="api_key", # The API key to authenticate with the feature store
+ hostname_verification=True, # Disable for self-signed certificates
)
-fs = project.get_feature_store() # Get the project's default feature store
+fs = project.get_feature_store() # Get the project's default feature store
```
!!! note "Engine"
diff --git a/docs/user_guides/migration/40_migration.md b/docs/user_guides/migration/40_migration.md
index 2b589b99f..58268d31f 100644
--- a/docs/user_guides/migration/40_migration.md
+++ b/docs/user_guides/migration/40_migration.md
@@ -53,8 +53,10 @@ The following is how transformation functions were used in previous versions of
def add_one(feature):
return feature + 1
+
# Create transformation function
- add_one = fs.create_transformation_function(add_one,
+ add_one = fs.create_transformation_function(
+ add_one,
output_type=int,
version=1,
)
@@ -70,14 +72,14 @@ The following is how transformation functions were used in previous versions of
# Create feature view
feature_view = fs.get_or_create_feature_view(
- name='serving_fv',
+ name="serving_fv",
version=1,
query=selected_features,
# Apply your custom transformation functions to the feature `feature_1`
transformation_functions={
"feature_1": add_one,
},
- labels=['target'],
+ labels=["target"],
)
```
@@ -93,16 +95,17 @@ The following is how transformation functions were used in previous versions of
def add_one(feature):
return feature + 1
+
# Create feature view
feature_view = fs.get_or_create_feature_view(
- name='serving_fv',
+ name="serving_fv",
version=1,
query=selected_features,
# Apply the custom transformation functions defined to the feature `feature_1`
transformation_functions=[
add_one("feature_1"),
],
- labels=['target'],
+ labels=["target"],
)
```
diff --git a/docs/user_guides/mlops/registry/frameworks/llm.md b/docs/user_guides/mlops/registry/frameworks/llm.md
index 94c64862d..65bde785c 100644
--- a/docs/user_guides/mlops/registry/frameworks/llm.md
+++ b/docs/user_guides/mlops/registry/frameworks/llm.md
@@ -34,10 +34,7 @@ LLMs can typically be downloaded using the official frameworks provided by their
# Download LLM (e.g., using huggingface to download Llama-3.1-8B base model)
from huggingface_hub import snapshot_download
- model_dir = snapshot_download(
- "meta-llama/Llama-3.1-8B",
- ignore_patterns="original/*"
- )
+ model_dir = snapshot_download("meta-llama/Llama-3.1-8B", ignore_patterns="original/*")
```
### Step 3: (Optional) Fine-tune LLM
@@ -61,7 +58,7 @@ Define a name, and attach optional metrics for your model, then invoke the `save
```python
# Model evaluation metrics
- metrics = {'f1-score': 0.8, 'perplexity': 31.62, 'bleu-score': 0.73}
+ metrics = {"f1-score": 0.8, "perplexity": 31.62, "bleu-score": 0.73}
llm_model = mr.llm.create_model("llm_model", metrics=metrics)
diff --git a/docs/user_guides/mlops/registry/frameworks/python.md b/docs/user_guides/mlops/registry/frameworks/python.md
index 6b54454bf..e13274b13 100644
--- a/docs/user_guides/mlops/registry/frameworks/python.md
+++ b/docs/user_guides/mlops/registry/frameworks/python.md
@@ -58,7 +58,7 @@ Define a name, and attach optional metrics for your model, then invoke the `save
```python
# Model evaluation metrics
- metrics = {'accuracy': 0.92}
+ metrics = {"accuracy": 0.92}
py_model = mr.python.create_model("py_model", metrics=metrics)
diff --git a/docs/user_guides/mlops/registry/frameworks/skl.md b/docs/user_guides/mlops/registry/frameworks/skl.md
index 9edce3839..b15fa2ed6 100644
--- a/docs/user_guides/mlops/registry/frameworks/skl.md
+++ b/docs/user_guides/mlops/registry/frameworks/skl.md
@@ -57,7 +57,7 @@ Define a name, and attach optional metrics for your model, then invoke the `save
```python
# Model evaluation metrics
- metrics = {'accuracy': 0.92}
+ metrics = {"accuracy": 0.92}
skl_model = mr.sklearn.create_model("skl_model", metrics=metrics)
diff --git a/docs/user_guides/mlops/registry/frameworks/tch.md b/docs/user_guides/mlops/registry/frameworks/tch.md
index 27551958b..f83831f6a 100644
--- a/docs/user_guides/mlops/registry/frameworks/tch.md
+++ b/docs/user_guides/mlops/registry/frameworks/tch.md
@@ -42,6 +42,7 @@ Define your Torch model and run the training loop.
...
return x
+
# Instantiate the model
net = Net()
@@ -71,7 +72,7 @@ Define a name, and attach optional metrics for your model, then invoke the `save
```python
# Model evaluation metrics
- metrics = {'accuracy': 0.92}
+ metrics = {"accuracy": 0.92}
tch_model = mr.torch.create_model("tch_model", metrics=metrics)
diff --git a/docs/user_guides/mlops/registry/frameworks/tf.md b/docs/user_guides/mlops/registry/frameworks/tf.md
index 97070fda8..f8ce57212 100644
--- a/docs/user_guides/mlops/registry/frameworks/tf.md
+++ b/docs/user_guides/mlops/registry/frameworks/tf.md
@@ -67,7 +67,7 @@ Define a name, and attach optional metrics for your model, then invoke the `save
```python
# Model evaluation metrics
- metrics = {'accuracy': 0.92}
+ metrics = {"accuracy": 0.92}
tf_model = mr.tensorflow.create_model("tf_model", metrics=metrics)
diff --git a/docs/user_guides/mlops/registry/input_example.md b/docs/user_guides/mlops/registry/input_example.md
index cdfd5c2e7..24ddf1f88 100644
--- a/docs/user_guides/mlops/registry/input_example.md
+++ b/docs/user_guides/mlops/registry/input_example.md
@@ -45,7 +45,6 @@ Set the `input_example` parameter in the `create_model` function and call `save(
=== "Python"
```python
- model = mr.tensorflow.create_model(name="mnist",
- input_example=input_example)
+ model = mr.tensorflow.create_model(name="mnist", input_example=input_example)
model.save("./model")
```
diff --git a/docs/user_guides/mlops/registry/model_evaluation_images.md b/docs/user_guides/mlops/registry/model_evaluation_images.md
index 64d507c7c..036ff9cad 100644
--- a/docs/user_guides/mlops/registry/model_evaluation_images.md
+++ b/docs/user_guides/mlops/registry/model_evaluation_images.md
@@ -48,8 +48,8 @@ Generate an image that visualizes model performance and evaluation metrics
# Create a DataFrame for the confusion matrix results
df_confusion_matrix = pd.DataFrame(
results,
- ['True Normal', 'True Fraud'],
- ['Pred Normal', 'Pred Fraud'],
+ ["True Normal", "True Fraud"],
+ ["Pred Normal", "Pred Fraud"],
)
# Create a heatmap using seaborn with annotations
diff --git a/docs/user_guides/mlops/registry/model_schema.md b/docs/user_guides/mlops/registry/model_schema.md
index 98e6e98ac..e8d4de29c 100644
--- a/docs/user_guides/mlops/registry/model_schema.md
+++ b/docs/user_guides/mlops/registry/model_schema.md
@@ -38,13 +38,19 @@ Currently, we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list`.
from hsml.utils.schema import Schema
# Model inputs for MNIST dataset
- inputs = [{'type': 'uint8', 'shape': [28, 28, 1], 'description': 'grayscale representation of 28x28 MNIST images'}]
+ inputs = [
+ {
+ "type": "uint8",
+ "shape": [28, 28, 1],
+ "description": "grayscale representation of 28x28 MNIST images",
+ }
+ ]
# Build the input schema
input_schema = Schema(inputs)
# Model outputs
- outputs = [{'type': 'float32', 'shape': [10]}]
+ outputs = [{"type": "float32", "shape": [10]}]
# Build the output schema
output_schema = Schema(outputs)
@@ -60,7 +66,6 @@ Set the `model_schema` parameter in the `create_model` function and call `save()
=== "Python"
```python
- model = mr.tensorflow.create_model(name="mnist",
- model_schema=model_schema)
+ model = mr.tensorflow.create_model(name="mnist", model_schema=model_schema)
model.save("./model")
```
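
For context, the `model_schema` object passed to `create_model` above is built from the input and output schemas defined earlier; a minimal sketch using hsml's `ModelSchema`:

```python
# Combine the input and output schemas into a single model schema
# (hsml's ModelSchema constructor; shown here as a sketch for completeness).
from hsml.model_schema import ModelSchema

model_schema = ModelSchema(input_schema=input_schema, output_schema=output_schema)
```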
diff --git a/docs/user_guides/mlops/serving/api-protocol.md b/docs/user_guides/mlops/serving/api-protocol.md
index cea116f26..b5f11e898 100644
--- a/docs/user_guides/mlops/serving/api-protocol.md
+++ b/docs/user_guides/mlops/serving/api-protocol.md
@@ -93,7 +93,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott
my_predictor = ms.create_predictor(
my_model,
- api_protocol="GRPC" # defaults to "REST"
+ api_protocol="GRPC", # defaults to "REST"
)
my_predictor.deploy()
@@ -101,6 +101,8 @@ Once you are done with the changes, click on `Create new deployment` at the bott
my_deployment = ms.create_deployment(my_predictor)
my_deployment.save()
+
+
```
### API Reference
diff --git a/docs/user_guides/mlops/serving/deployment-state.md b/docs/user_guides/mlops/serving/deployment-state.md
index d73068398..7bc60e581 100644
--- a/docs/user_guides/mlops/serving/deployment-state.md
+++ b/docs/user_guides/mlops/serving/deployment-state.md
@@ -85,6 +85,8 @@ Additionally, you can find the nº of instances currently running by scrolling d
```python
deployment = ms.get_deployment("mydeployment")
+
+
```
### Step 3: Inspect deployment state
@@ -95,6 +97,8 @@ Additionally, you can find the nº of instances currently running by scrolling d
state = deployment.get_state()
state.describe()
+
+
```
### Step 4: Check nº of running instances
@@ -107,6 +111,8 @@ Additionally, you can find the nº of instances currently running by scrolling d
# nº of transformer instances
deployment.transformer.resources.describe()
+
+
```
### API Reference
diff --git a/docs/user_guides/mlops/serving/deployment.md b/docs/user_guides/mlops/serving/deployment.md
index 488989143..c514a0fd8 100644
--- a/docs/user_guides/mlops/serving/deployment.md
+++ b/docs/user_guides/mlops/serving/deployment.md
@@ -158,6 +158,8 @@ Retrieve the trained model you want to deploy.
```python
my_model = mr.get_model("my_model", version=1)
+
+
```
#### Option A: Using the model object
@@ -166,6 +168,8 @@ Retrieve the trained model you want to deploy.
```python
my_deployment = my_model.deploy()
+
+
```
#### Option B: Using the Model Serving handle
@@ -182,6 +186,8 @@ Retrieve the trained model you want to deploy.
# or
my_deployment = ms.create_deployment(my_predictor)
my_deployment.save()
+
+
```
### API Reference
diff --git a/docs/user_guides/mlops/serving/inference-batcher.md b/docs/user_guides/mlops/serving/inference-batcher.md
index 8dc94ed58..d978b900d 100644
--- a/docs/user_guides/mlops/serving/inference-batcher.md
+++ b/docs/user_guides/mlops/serving/inference-batcher.md
@@ -75,12 +75,13 @@ Once you are done with the changes, click on `Create new deployment` at the bott
```python
from hsml.inference_batcher import InferenceBatcher
- my_batcher = InferenceBatcher(enabled=True,
- # optional
- max_batch_size=32,
- max_latency=5000, # milliseconds
- timeout=5 # seconds
- )
+ my_batcher = InferenceBatcher(
+ enabled=True,
+ # optional
+ max_batch_size=32,
+ max_latency=5000, # milliseconds
+ timeout=5, # seconds
+ )
```
### Step 3: Create a deployment with the inference batcher
@@ -88,18 +89,17 @@ Once you are done with the changes, click on `Create new deployment` at the bott
=== "Python"
```python
-
my_model = mr.get_model("my_model", version=1)
- my_predictor = ms.create_predictor(my_model,
- inference_batcher=my_batcher
- )
+ my_predictor = ms.create_predictor(my_model, inference_batcher=my_batcher)
my_predictor.deploy()
# or
my_deployment = ms.create_deployment(my_predictor)
my_deployment.save()
+
+
```
### API Reference
diff --git a/docs/user_guides/mlops/serving/inference-logger.md b/docs/user_guides/mlops/serving/inference-logger.md
index 47f8ebaea..64012324b 100644
--- a/docs/user_guides/mlops/serving/inference-logger.md
+++ b/docs/user_guides/mlops/serving/inference-logger.md
@@ -78,15 +78,15 @@ Once you are done with the changes, click on `Create new deployment` at the bott
=== "Python"
```python
-
from hsml.inference_logger import InferenceLogger
from hsml.kafka_topic import KafkaTopic
- new_topic = KafkaTopic(name="CREATE",
- # optional
- num_partitions=1,
- num_replicas=1
- )
+ new_topic = KafkaTopic(
+ name="CREATE",
+ # optional
+ num_partitions=1,
+ num_replicas=1,
+ )
my_logger = InferenceLogger(kafka_topic=new_topic, mode="ALL")
```
@@ -95,7 +95,6 @@ Once you are done with the changes, click on `Create new deployment` at the bott
Similarly, you can create the same logger with:
```python
-
my_logger = InferenceLogger(kafka_topic={"name": "CREATE"}, mode="ALL")
```
@@ -106,15 +105,15 @@ Once you are done with the changes, click on `Create new deployment` at the bott
```python
my_model = mr.get_model("my_model", version=1)
- my_predictor = ms.create_predictor(my_model,
- inference_logger=my_logger
- )
+ my_predictor = ms.create_predictor(my_model, inference_logger=my_logger)
my_predictor.deploy()
# or
my_deployment = ms.create_deployment(my_predictor)
my_deployment.save()
+
+
```
### API Reference
diff --git a/docs/user_guides/mlops/serving/predictor.md b/docs/user_guides/mlops/serving/predictor.md
index 049bfe852..4470ffac7 100644
--- a/docs/user_guides/mlops/serving/predictor.md
+++ b/docs/user_guides/mlops/serving/predictor.md
@@ -173,15 +173,14 @@ Once you are done with the changes, click on `Create new deployment` at the bott
=== "Predictor"
``` python
- class Predictor():
-
+ class Predictor:
def __init__(self):
- """ Initialization code goes here"""
+ """Initialization code goes here"""
# Model files can be found at os.environ["MODEL_FILES_PATH"]
# self.model = ... # load your model
def predict(self, inputs):
- """ Serve predictions using the trained model"""
+ """Serve predictions using the trained model"""
# Use the model to make predictions
# return self.model.predict(inputs)
```
@@ -189,15 +188,14 @@ Once you are done with the changes, click on `Create new deployment` at the bott
=== "Async Predictor"
``` python
- class Predictor():
-
+ class Predictor:
def __init__(self):
- """ Initialization code goes here"""
+ """Initialization code goes here"""
# Model files can be found at os.environ["MODEL_FILES_PATH"]
# self.model = ... # load your model
async def predict(self, inputs):
- """ Asynchronously serve predictions using the trained model"""
+ """Asynchronously serve predictions using the trained model"""
        # Perform any async operations required
# result = await some_async_preprocessing(inputs)
@@ -274,8 +272,14 @@ Once you are done with the changes, click on `Create new deployment` at the bott
=== "Python"
```python
- uploaded_file_path = dataset_api.upload("my_predictor.py", "Resources", overwrite=True)
- predictor_script_path = os.path.join("/Projects", project.name, uploaded_file_path)
+ uploaded_file_path = dataset_api.upload(
+ "my_predictor.py", "Resources", overwrite=True
+ )
+ predictor_script_path = os.path.join(
+ "/Projects", project.name, uploaded_file_path
+ )
+
+
```
### Step 4: Define predictor
@@ -285,12 +289,15 @@ Once you are done with the changes, click on `Create new deployment` at the bott
```python
my_model = mr.get_model("my_model", version=1)
- my_predictor = ms.create_predictor(my_model,
- # optional
- model_server="PYTHON",
- serving_tool="KSERVE",
- script_file=predictor_script_path
- )
+ my_predictor = ms.create_predictor(
+ my_model,
+ # optional
+ model_server="PYTHON",
+ serving_tool="KSERVE",
+ script_file=predictor_script_path,
+ )
+
+
```
### Step 5: Create a deployment with the predictor
@@ -303,6 +310,8 @@ Once you are done with the changes, click on `Create new deployment` at the bott
# or
my_deployment = ms.create_deployment(my_predictor)
my_deployment.save()
+
+
```
### API Reference
diff --git a/docs/user_guides/mlops/serving/resources.md b/docs/user_guides/mlops/serving/resources.md
index 32d99adba..2ba350d1a 100644
--- a/docs/user_guides/mlops/serving/resources.md
+++ b/docs/user_guides/mlops/serving/resources.md
@@ -84,7 +84,9 @@ Once you are done with the changes, click on `Create new deployment` at the bott
minimum_res = Resources(cores=1, memory=128, gpus=1)
maximum_res = Resources(cores=2, memory=256, gpus=1)
- predictor_res = PredictorResources(num_instances=1, requests=minimum_res, limits=maximum_res)
+ predictor_res = PredictorResources(
+ num_instances=1, requests=minimum_res, limits=maximum_res
+ )
```
### Step 3 (Optional): Define the transformer resource configuration
@@ -97,7 +99,11 @@ Once you are done with the changes, click on `Create new deployment` at the bott
minimum_res = Resources(cores=1, memory=128, gpus=1)
maximum_res = Resources(cores=2, memory=256, gpus=1)
- transformer_res = TransformerResources(num_instances=2, requests=minimum_res, limits=maximum_res)
+ transformer_res = TransformerResources(
+ num_instances=2, requests=minimum_res, limits=maximum_res
+ )
+
+
```
### Step 4: Create a deployment with the resource configuration
@@ -107,17 +113,20 @@ Once you are done with the changes, click on `Create new deployment` at the bott
```python
my_model = mr.get_model("my_model", version=1)
- my_predictor = ms.create_predictor(my_model,
- resources=predictor_res,
- # transformer=Transformer(script_file,
- # resources=transformer_res)
- )
+ my_predictor = ms.create_predictor(
+ my_model,
+ resources=predictor_res,
+ # transformer=Transformer(script_file,
+ # resources=transformer_res)
+ )
my_predictor.deploy()
# or
my_deployment = ms.create_deployment(my_predictor)
my_deployment.save()
+
+
```
### API Reference
diff --git a/docs/user_guides/mlops/serving/rest-api.md b/docs/user_guides/mlops/serving/rest-api.md
index d355bf4aa..d7e99de1e 100644
--- a/docs/user_guides/mlops/serving/rest-api.md
+++ b/docs/user_guides/mlops/serving/rest-api.md
@@ -54,27 +54,20 @@ An example for this is given below.
```python
import requests
- data = {
- "inputs": [
- [
- 4641025220953719,
- 4920355418495856
- ]
- ]
- }
+ data = {"inputs": [[4641025220953719, 4920355418495856]]}
headers = {
"Host": "fraud.test.hopsworks.ai",
"Authorization": "ApiKey 8kDOlnRlJU4kiV1Y.RmFNJY3XKAUSqmJZ03kbUbXKMQSHveSBgMIGT84qrM5qXMjLib7hdlfGeg8fBQZp",
- "Content-Type": "application/json"
+ "Content-Type": "application/json",
}
response = requests.post(
- "http://10.87.42.108/v1/models/fraud:predict",
- headers=headers,
- json=data
+ "http://10.87.42.108/v1/models/fraud:predict", headers=headers, json=data
)
print(response.json())
+
+
```
=== "Curl"
diff --git a/docs/user_guides/mlops/serving/transformer.md b/docs/user_guides/mlops/serving/transformer.md
index b2606206f..9abf279d5 100644
--- a/docs/user_guides/mlops/serving/transformer.md
+++ b/docs/user_guides/mlops/serving/transformer.md
@@ -115,17 +115,17 @@ Once you are done with the changes, click on `Create new deployment` at the bott
=== "Transformer"
```python
- class Transformer():
+ class Transformer:
def __init__(self):
- """ Initialization code goes here"""
+ """Initialization code goes here"""
pass
def preprocess(self, inputs):
- """ Transform the requests inputs here. The object returned by this method will be used as model input to make predictions. """
+            """Transform the request's inputs here. The object returned by this method will be used as model input to make predictions."""
return inputs
def postprocess(self, outputs):
- """ Transform the predictions computed by the model before returning a response """
+ """Transform the predictions computed by the model before returning a response"""
return outputs
```
@@ -139,8 +139,14 @@ Once you are done with the changes, click on `Create new deployment` at the bott
=== "Python"
```python
- uploaded_file_path = dataset_api.upload("my_transformer.py", "Resources", overwrite=True)
- transformer_script_path = os.path.join("/Projects", project.name, uploaded_file_path)
+ uploaded_file_path = dataset_api.upload(
+ "my_transformer.py", "Resources", overwrite=True
+ )
+ transformer_script_path = os.path.join(
+ "/Projects", project.name, uploaded_file_path
+ )
+
+
```
### Step 4: Define a transformer
@@ -155,6 +161,8 @@ Once you are done with the changes, click on `Create new deployment` at the bott
from hsml.transformer import Transformer
my_transformer = Transformer(script_file)
+
+
```
### Step 5: Create a deployment with the transformer
@@ -168,6 +176,8 @@ Once you are done with the changes, click on `Create new deployment` at the bott
# or
my_deployment = ms.create_deployment(my_predictor, transformer=my_transformer)
my_deployment.save()
+
+
```
### API Reference
diff --git a/docs/user_guides/mlops/serving/troubleshooting.md b/docs/user_guides/mlops/serving/troubleshooting.md
index ba0ce1ac1..f02d942ab 100644
--- a/docs/user_guides/mlops/serving/troubleshooting.md
+++ b/docs/user_guides/mlops/serving/troubleshooting.md
@@ -134,6 +134,8 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f
```python
deployment = ms.get_deployment("mydeployment")
+
+
```
### Step 3: Get current deployment's predictor state
@@ -144,6 +146,8 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f
state = deployment.get_state()
state.describe()
+
+
```
### Step 4: Explore transient logs
@@ -152,6 +156,8 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f
```python
deployment.get_logs(component="predictor|transformer", tail=10)
+
+
```
### API Reference
diff --git a/docs/user_guides/projects/airflow/airflow.md b/docs/user_guides/projects/airflow/airflow.md
index 54883e12c..8d2bb6f63 100644
--- a/docs/user_guides/projects/airflow/airflow.md
+++ b/docs/user_guides/projects/airflow/airflow.md
@@ -66,12 +66,14 @@ The Airflow DAGs are stored in the _Airflow_ dataset which you can access using
When writing the code for the DAG you can invoke the operator as follows:
```python
-HopsworksLaunchOperator(dag=dag,
- task_id="profiles_fg_0",
- project_name="airflow_doc",
- job_name="profiles_fg",
- job_arguments="",
- wait_for_completion=True)
+HopsworksLaunchOperator(
+ dag=dag,
+ task_id="profiles_fg_0",
+ project_name="airflow_doc",
+ job_name="profiles_fg",
+ job_arguments="",
+ wait_for_completion=True,
+)
```
You should provide the name of the Airflow task (`task_id`) and the Hopsworks job information (`project_name`, `job_name`, `job_arguments`).
@@ -81,10 +83,12 @@ Similarly, you can invoke the sensor as shown below.
You should provide the name of the Airflow task (`task_id`) and the Hopsworks job information (`project_name`, `job_name`).
```python
-HopsworksJobSuccessSensor(dag=dag,
- task_id='wait_for_profiles_fg',
- project_name="airflow_doc",
- job_name='profiles_fg')
+HopsworksJobSuccessSensor(
+ dag=dag,
+ task_id="wait_for_profiles_fg",
+ project_name="airflow_doc",
+ job_name="profiles_fg",
+)
```
When writing the DAG file, you should also add the `access_control` parameter to the DAG configuration.
@@ -96,13 +100,12 @@ If you do not specify the `access_control` option, project members will not be a
```python
dag = DAG(
- dag_id = "example_dag",
- default_args = args,
- access_control = {
+ dag_id="example_dag",
+ default_args=args,
+ access_control={
"project_name": {"can_dag_read", "can_dag_edit"},
},
-
- schedule_interval = "0 4 * * *"
+ schedule_interval="0 4 * * *",
)
```
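
Putting the operator, the sensor, and the `access_control` settings together, a minimal sketch of a complete DAG file follows; the plugin import paths are assumptions and may differ in your Hopsworks installation.

```python
# Sketch of a complete DAG file; the hopsworks_plugin import paths below
# are assumptions — verify them against your Hopsworks Airflow plugin.
from datetime import datetime

from airflow import DAG
from hopsworks_plugin.operators.hopsworks_operator import HopsworksLaunchOperator
from hopsworks_plugin.sensors.hopsworks_sensor import HopsworksJobSuccessSensor

args = {"owner": "airflow", "start_date": datetime(2026, 1, 1)}

dag = DAG(
    dag_id="example_dag",
    default_args=args,
    access_control={
        "project_name": {"can_dag_read", "can_dag_edit"},
    },
    schedule_interval="0 4 * * *",
)

launch = HopsworksLaunchOperator(
    dag=dag,
    task_id="profiles_fg_0",
    project_name="airflow_doc",
    job_name="profiles_fg",
    job_arguments="",
    wait_for_completion=False,  # let the sensor below do the waiting
)

wait = HopsworksJobSuccessSensor(
    dag=dag,
    task_id="wait_for_profiles_fg",
    project_name="airflow_doc",
    job_name="profiles_fg",
)

launch >> wait  # launch the job, then block until it reports success
```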
diff --git a/docs/user_guides/projects/git/clone_repo.md b/docs/user_guides/projects/git/clone_repo.md
index 342ed7b07..04766d46e 100644
--- a/docs/user_guides/projects/git/clone_repo.md
+++ b/docs/user_guides/projects/git/clone_repo.md
@@ -85,26 +85,26 @@ You can also clone a repository through the hopsworks git API in python.
### Step 1: Get the git API
```python
-
import hopsworks
project = hopsworks.login()
git_api = project.get_git_api()
-
```
### Step 2: Clone the repository
```python
-
-REPO_URL="https://github.com/logicalclocks/hops-examples.git" # git repository
-HOPSWORKS_FOLDER="Jupyter" # path in Hopsworks filesystem to clone to
-PROVIDER="GitHub"
-BRANCH="master" # optional branch to clone
-
-examples_repo = git_api.clone(REPO_URL, HOPSWORKS_FOLDER, PROVIDER, branch=BRANCH)
-
+REPO_URL = (
+ "https://github.com/logicalclocks/hops-examples.git" # git repository
+)
+HOPSWORKS_FOLDER = "Jupyter" # path in Hopsworks filesystem to clone to
+PROVIDER = "GitHub"
+BRANCH = "master" # optional branch to clone
+
+examples_repo = git_api.clone(
+ REPO_URL, HOPSWORKS_FOLDER, PROVIDER, branch=BRANCH
+)
```
### API Reference
diff --git a/docs/user_guides/projects/git/configure_git_provider.md b/docs/user_guides/projects/git/configure_git_provider.md
index 5ee1ab7a1..25a95319e 100644
--- a/docs/user_guides/projects/git/configure_git_provider.md
+++ b/docs/user_guides/projects/git/configure_git_provider.md
@@ -63,25 +63,21 @@ You can also configure a git provider using the hopsworks git API in python.
### Step 1: Get the git API
```python
-
import hopsworks
project = hopsworks.login()
git_api = project.get_git_api()
-
```
### Step 2: Configure git provider
```python
-
-PROVIDER="GitHub"
-GITHUB_USER="my_user"
-API_TOKEN="my_token"
+PROVIDER = "GitHub"
+GITHUB_USER = "my_user"
+API_TOKEN = "my_token"
git_api.set_provider(PROVIDER, GITHUB_USER, API_TOKEN)
-
```
### API Reference
diff --git a/docs/user_guides/projects/git/repository_actions.md b/docs/user_guides/projects/git/repository_actions.md
index c7c0095ca..3ea1e8fec 100644
--- a/docs/user_guides/projects/git/repository_actions.md
+++ b/docs/user_guides/projects/git/repository_actions.md
@@ -45,27 +45,23 @@ You can also perform the repository actions using the hopsworks git API in pytho
### Step 1: Get the git API
```python
-
import hopsworks
project = hopsworks.login()
git_api = project.get_git_api()
-
```
### Step 2: Get the git repository
```python
git_repo = git_api.get_repo(REPOSITORY_NAME)
-
```
### Step 3: Perform the git repository action, e.g. commit
```python
git_repo = git_api.commit("Test commit")
-
```
### API Reference
diff --git a/docs/user_guides/projects/jobs/notebook_job.md b/docs/user_guides/projects/jobs/notebook_job.md
index 6b671b441..2630f1a9a 100644
--- a/docs/user_guides/projects/jobs/notebook_job.md
+++ b/docs/user_guides/projects/jobs/notebook_job.md
@@ -145,7 +145,6 @@ This snippet assumes the Jupyter Notebook script is in the current working direc
It will upload the Jupyter Notebook script to the `Resources` dataset in your project.
```python
-
import hopsworks
project = hopsworks.login()
@@ -153,7 +152,6 @@ project = hopsworks.login()
dataset_api = project.get_dataset_api()
uploaded_file_path = dataset_api.upload("notebook.ipynb", "Resources")
-
```
### Step 2: Create Jupyter Notebook job
@@ -161,19 +159,17 @@ uploaded_file_path = dataset_api.upload("notebook.ipynb", "Resources")
In this snippet we get the `JobsApi` object, fetch the default job configuration for a `PYTHON` job, set the Jupyter Notebook file, override the environment to run in, and finally create the `Job` object.
```python
-
jobs_api = project.get_job_api()
notebook_job_config = jobs_api.get_configuration("PYTHON")
# Set the application file
-notebook_job_config['appPath'] = uploaded_file_path
+notebook_job_config["appPath"] = uploaded_file_path
# Override the python job environment
-notebook_job_config['environmentName'] = "python-feature-pipeline"
+notebook_job_config["environmentName"] = "python-feature-pipeline"
job = jobs_api.create_job("notebook_job", notebook_job_config)
-
```
### Step 3: Execute the job
@@ -181,9 +177,8 @@ job = jobs_api.create_job("notebook_job", notebook_job_config)
In this code snippet, we execute the job with arguments and wait until it reaches a terminal state.
```python
-
# Run the job
-execution = job.run(args='-p a 2 -p b 5', await_termination=True)
+execution = job.run(args="-p a 2 -p b 5", await_termination=True)
```
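The `-p a 2 -p b 5` arguments follow the papermill-style `-p name value` convention for injecting notebook parameters (an assumption about the underlying notebook runner; check your Hopsworks version). A sketch of the corresponding parameters cell in `notebook.ipynb`:

```python
# Sketch of the first cell of notebook.ipynb, assuming papermill-style
# parameter injection (in Jupyter, tag this cell "parameters"):
a = 1  # default; "-p a 2" overrides this to 2 at run time
b = 1  # default; "-p b 5" overrides this to 5 at run time

# Subsequent cells then see the injected values:
print(a + b)  # prints 7 for the run above
```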
## Configuration
diff --git a/docs/user_guides/projects/jobs/pyspark_job.md b/docs/user_guides/projects/jobs/pyspark_job.md
index 2921f7bcf..5ac93555c 100644
--- a/docs/user_guides/projects/jobs/pyspark_job.md
+++ b/docs/user_guides/projects/jobs/pyspark_job.md
@@ -174,7 +174,6 @@ This snippet assumes the program to run is in the current working directory and
It will upload the python script to the `Resources` dataset in your project.
```python
-
import hopsworks
project = hopsworks.login()
@@ -182,7 +181,6 @@ project = hopsworks.login()
dataset_api = project.get_dataset_api()
uploaded_file_path = dataset_api.upload("script.py", "Resources")
-
```
### Step 2: Create PySpark job
@@ -190,19 +188,17 @@ uploaded_file_path = dataset_api.upload("script.py", "Resources")
In this snippet we get the `JobsApi` object, fetch the default job configuration for a `PYSPARK` job, set the PySpark script, override the environment to run in, and finally create the `Job` object.
```python
-
jobs_api = project.get_job_api()
spark_config = jobs_api.get_configuration("PYSPARK")
# Set the application file
-spark_config['appPath'] = uploaded_file_path
+spark_config["appPath"] = uploaded_file_path
# Override the python job environment
-spark_config['environmentName'] = "spark-feature-pipeline"
+spark_config["environmentName"] = "spark-feature-pipeline"
job = jobs_api.create_job("pyspark_job", spark_config)
-
```
### Step 3: Execute the job
@@ -210,7 +206,6 @@ job = jobs_api.create_job("pyspark_job", spark_config)
In this snippet we execute the job synchronously, that is, we wait until it reaches a terminal state, and then download and print the logs.
```python
-
execution = job.run(await_termination=True)
out, err = execution.download_logs()
@@ -220,7 +215,6 @@ print(f_out.read())
f_err = open(err, "r")
print(f_err.read())
-
```
## Configuration
@@ -259,7 +253,9 @@ To read a dataset in your project using Spark, use the full filesystem path wher
For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`:
```python
-df = spark.read.csv("/Projects/my_project/Resources/data.csv", header=True, inferSchema=True)
+df = spark.read.csv(
+ "/Projects/my_project/Resources/data.csv", header=True, inferSchema=True
+)
df.show()
```
diff --git a/docs/user_guides/projects/jobs/python_job.md b/docs/user_guides/projects/jobs/python_job.md
index 5252f5081..9ab3453a2 100644
--- a/docs/user_guides/projects/jobs/python_job.md
+++ b/docs/user_guides/projects/jobs/python_job.md
@@ -130,7 +130,6 @@ This snippet assumes the python script is in the current working directory and n
It will upload the python script to the `Resources` dataset in your project.
```python
-
import hopsworks
project = hopsworks.login()
@@ -138,7 +137,6 @@ project = hopsworks.login()
dataset_api = project.get_dataset_api()
uploaded_file_path = dataset_api.upload("script.py", "Resources")
-
```
### Step 2: Create Python job
@@ -146,19 +144,17 @@ uploaded_file_path = dataset_api.upload("script.py", "Resources")
In this snippet we get the `JobsApi` object, fetch the default job configuration for a `PYTHON` job, set the Python script, override the environment to run in, and finally create the `Job` object.
```python
-
jobs_api = project.get_job_api()
py_job_config = jobs_api.get_configuration("PYTHON")
# Set the application file
-py_job_config['appPath'] = uploaded_file_path
+py_job_config["appPath"] = uploaded_file_path
# Override the python job environment
-py_job_config['environmentName'] = "python-feature-pipeline"
+py_job_config["environmentName"] = "python-feature-pipeline"
job = jobs_api.create_job("py_job", py_job_config)
-
```
### Step 3: Execute the job
@@ -166,7 +162,6 @@ job = jobs_api.create_job("py_job", py_job_config)
In this snippet we execute the job synchronously, that is, we wait until it reaches a terminal state, and then download and print the logs.
```python
-
# Run the job
execution = job.run(await_termination=True)
@@ -178,7 +173,6 @@ print(f_out.read())
f_err = open(err, "r")
print(f_err.read())
-
```
## Configuration
diff --git a/docs/user_guides/projects/jobs/ray_job.md b/docs/user_guides/projects/jobs/ray_job.md
index e558ff58a..0c6df6c86 100644
--- a/docs/user_guides/projects/jobs/ray_job.md
+++ b/docs/user_guides/projects/jobs/ray_job.md
@@ -175,7 +175,6 @@ If the file is already in the project, you can skip this step.
It will upload the Python script to the `Resources` dataset in your project.
```python
-
import hopsworks
project = hopsworks.login()
@@ -183,7 +182,6 @@ project = hopsworks.login()
dataset_api = project.get_dataset_api()
uploaded_file_path = dataset_api.upload("ray_job.py", "Resources")
-
```
### Step 2: Create Ray job
@@ -191,22 +189,20 @@ uploaded_file_path = dataset_api.upload("ray_job.py", "Resources")
In this snippet we get the `JobsApi` object, fetch the default job configuration for a `RAY` job, set the Python script to run, and create the `Job` object.
```python
-
jobs_api = project.get_job_api()
ray_config = jobs_api.get_configuration("RAY")
-ray_config['appPath'] = uploaded_file_path
-ray_config['environmentName'] = "ray-training-pipeline"
-ray_config['driverCores'] = 2
-ray_config['driverMemory'] = 2048
-ray_config['workerCores'] = 2
-ray_config['workerMemory'] = 4096
-ray_config['minWorkers'] = 1
-ray_config['maxWorkers'] = 4
+ray_config["appPath"] = uploaded_file_path
+ray_config["environmentName"] = "ray-training-pipeline"
+ray_config["driverCores"] = 2
+ray_config["driverMemory"] = 2048
+ray_config["workerCores"] = 2
+ray_config["workerMemory"] = 4096
+ray_config["minWorkers"] = 1
+ray_config["maxWorkers"] = 4
job = jobs_api.create_job("ray_job", ray_config)
-
```
### Step 3: Execute the job
@@ -214,7 +210,6 @@ job = jobs_api.create_job("ray_job", ray_config)
In this snippet we execute the job synchronously, that is, we wait until it reaches a terminal state, and then download and print the logs.
```python
-
execution = job.run(await_termination=True)
out, err = execution.download_logs()
@@ -224,7 +219,6 @@ print(f_out.read())
f_err = open(err, "r")
print(f_err.read())
-
```
## Configuration
diff --git a/docs/user_guides/projects/jobs/spark_job.md b/docs/user_guides/projects/jobs/spark_job.md
index 1fb25ff61..925b6d97d 100644
--- a/docs/user_guides/projects/jobs/spark_job.md
+++ b/docs/user_guides/projects/jobs/spark_job.md
@@ -178,7 +178,6 @@ This snippet assumes the Spark program is in the current working directory and n
It will upload the jar to the `Resources` dataset in your project.
```python
-
import hopsworks
project = hopsworks.login()
@@ -186,7 +185,6 @@ project = hopsworks.login()
dataset_api = project.get_dataset_api()
uploaded_file_path = dataset_api.upload("sparkpi.jar", "Resources")
-
```
### Step 2: Create Spark job
@@ -194,16 +192,14 @@ uploaded_file_path = dataset_api.upload("sparkpi.jar", "Resources")
In this snippet we get the `JobsApi` object, fetch the default job configuration for a `SPARK` job, set the application jar and main class to run, and create the `Job` object.
```python
-
jobs_api = project.get_job_api()
spark_config = jobs_api.get_configuration("SPARK")
-spark_config['appPath'] = uploaded_file_path
-spark_config['mainClass'] = 'org.apache.spark.examples.SparkPi'
+spark_config["appPath"] = uploaded_file_path
+spark_config["mainClass"] = "org.apache.spark.examples.SparkPi"
job = jobs_api.create_job("pyspark_job", spark_config)
-
```
### Step 3: Execute the job
@@ -211,7 +207,6 @@ job = jobs_api.create_job("pyspark_job", spark_config)
In this snippet we execute the job synchronously, that is, we wait until it reaches a terminal state, and then download and print the logs.
```python
-
execution = job.run(await_termination=True)
out, err = execution.download_logs()
@@ -221,7 +216,6 @@ print(f_out.read())
f_err = open(err, "r")
print(f_err.read())
-
```
## Configuration
diff --git a/docs/user_guides/projects/jupyter/spark_notebook.md b/docs/user_guides/projects/jupyter/spark_notebook.md
index 17eeb2167..f0ad43bc8 100644
--- a/docs/user_guides/projects/jupyter/spark_notebook.md
+++ b/docs/user_guides/projects/jupyter/spark_notebook.md
@@ -147,7 +147,9 @@ To read a dataset in your project using Spark, use the full filesystem path wher
For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`:
```python
-df = spark.read.csv("/Projects/my_project/Resources/data.csv", header=True, inferSchema=True)
+df = spark.read.csv(
+ "/Projects/my_project/Resources/data.csv", header=True, inferSchema=True
+)
df.show()
```
diff --git a/docs/user_guides/projects/kafka/consume_messages.md b/docs/user_guides/projects/kafka/consume_messages.md
index 6874d903a..72e7be6e1 100644
--- a/docs/user_guides/projects/kafka/consume_messages.md
+++ b/docs/user_guides/projects/kafka/consume_messages.md
@@ -16,39 +16,33 @@ In this guide, you will learn how to consume messages from a kafka topic.
### Step 1: Get the Kafka API
```python
-
import hopsworks
project = hopsworks.login()
kafka_api = project.get_kafka_api()
-
```
### Step 2: Configure confluent-kafka client
```python
-
consumer_config = kafka_api.get_default_config()
-consumer_config['default.topic.config'] = {'auto.offset.reset': 'earliest'}
+consumer_config["default.topic.config"] = {"auto.offset.reset": "earliest"}
from confluent_kafka import Consumer
consumer = Consumer(consumer_config)
-
```
### Step 3: Consume messages from a topic
```python
-
# Subscribe to topic
consumer.subscribe(["my_topic"])
for i in range(0, 10):
msg = consumer.poll(timeout=10.0)
print(msg.value())
-
```
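
The minimal loop above will raise an `AttributeError` if `poll` times out and returns `None`; a slightly more defensive sketch of the same loop:

```python
# Defensive variant of the poll loop above (sketch): handle empty polls
# and broker-level errors, and close the consumer cleanly when done.
consumer.subscribe(["my_topic"])
try:
    for _ in range(10):
        msg = consumer.poll(timeout=10.0)
        if msg is None:      # no message arrived within the timeout
            continue
        if msg.error():      # broker/partition level error
            print(msg.error())
            continue
        print(msg.value())
finally:
    consumer.close()         # commit offsets and leave the group cleanly
```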
### API Reference
diff --git a/docs/user_guides/projects/kafka/create_schema.md b/docs/user_guides/projects/kafka/create_schema.md
index cebd9fe30..e36e070c0 100644
--- a/docs/user_guides/projects/kafka/create_schema.md
+++ b/docs/user_guides/projects/kafka/create_schema.md
@@ -9,13 +9,11 @@ In this guide, you will learn how to create a Kafka Avro Schema in the Hopsworks
### Step 1: Get the Kafka API
```python
-
import hopsworks
project = hopsworks.login()
kafka_api = project.get_kafka_api()
-
```
### Step 2: Define the schema
@@ -23,25 +21,14 @@ kafka_api = project.get_kafka_api()
Define the Avro Schema, see [types](https://avro.apache.org/docs/current/spec.html#schema_primitive) for the format of the schema.
```python
-
schema = {
"type": "record",
"name": "tutorial",
"fields": [
- {
- "name": "id",
- "type": "int"
- },
- {
- "name": "data",
- "type": "string"
- }
- ]
+ {"name": "id", "type": "int"},
+ {"name": "data", "type": "string"},
+ ],
}
-
-
-
-
```
### Step 3: Create the schema
@@ -49,11 +36,9 @@ schema = {
Create the schema in the Schema Registry.
```python
-
-SCHEMA_NAME="schema_example"
+SCHEMA_NAME = "schema_example"
my_schema = kafka_api.create_schema(SCHEMA_NAME, schema)
-
```
### API Reference
diff --git a/docs/user_guides/projects/kafka/create_topic.md b/docs/user_guides/projects/kafka/create_topic.md
index 7ab420b4b..a6c3826a4 100644
--- a/docs/user_guides/projects/kafka/create_topic.md
+++ b/docs/user_guides/projects/kafka/create_topic.md
@@ -16,24 +16,22 @@ In this guide, you will learn how to create a Kafka Topic.
### Step 1: Get the Kafka API
```python
-
import hopsworks
project = hopsworks.login()
kafka_api = project.get_kafka_api()
-
```
### Step 2: Define the schema
```python
-TOPIC_NAME="topic_example"
-SCHEMA_NAME="schema_example"
-
-my_topic = kafka_api.create_topic(TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1)
-
+TOPIC_NAME = "topic_example"
+SCHEMA_NAME = "schema_example"
+my_topic = kafka_api.create_topic(
+    TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1
+)
```
### API Reference
diff --git a/docs/user_guides/projects/kafka/produce_messages.md b/docs/user_guides/projects/kafka/produce_messages.md
index a44a3804c..4f97630d2 100644
--- a/docs/user_guides/projects/kafka/produce_messages.md
+++ b/docs/user_guides/projects/kafka/produce_messages.md
@@ -16,41 +16,37 @@ In this guide, you will learn how to produce messages to a kafka topic.
### Step 1: Get the Kafka API
```python
-
import hopsworks
project = hopsworks.login()
kafka_api = project.get_kafka_api()
-
```
### Step 2: Configure confluent-kafka client
```python
-
producer_config = kafka_api.get_default_config()
from confluent_kafka import Producer
producer = Producer(producer_config)
-
```
### Step 3: Produce messages to topic
```python
-
-import uuid
import json
+import uuid
# Send a few messages
for i in range(0, 10):
- producer.produce("my_topic", json.dumps({"id": i, "data": str(uuid.uuid1())}), "key")
+ producer.produce(
+ "my_topic", json.dumps({"id": i, "data": str(uuid.uuid1())}), "key"
+ )
# Trigger the sending of all messages to the brokers, 10 sec timeout
producer.flush(10)
-
```
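
Optionally, per-message delivery can be confirmed via confluent-kafka's delivery callback; a sketch:

```python
# Optional (sketch): confirm delivery of each message with a callback,
# passed via confluent-kafka's callback/on_delivery parameter.
def delivery_report(err, msg):
    if err is not None:
        print(f"Delivery failed: {err}")
    else:
        print(f"Delivered to {msg.topic()} [partition {msg.partition()}]")

producer.produce(
    "my_topic",
    json.dumps({"id": 0, "data": str(uuid.uuid1())}),
    "key",
    callback=delivery_report,
)
producer.flush(10)
```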
### API Reference
diff --git a/docs/user_guides/projects/opensearch/connect.md b/docs/user_guides/projects/opensearch/connect.md
index 00d701f08..481ac9743 100644
--- a/docs/user_guides/projects/opensearch/connect.md
+++ b/docs/user_guides/projects/opensearch/connect.md
@@ -14,23 +14,19 @@ In this guide, you will learn how to connect to the OpenSearch cluster using an
### Step 1: Get the OpenSearch API
```python
-
import hopsworks
project = hopsworks.login()
opensearch_api = project.get_opensearch_api()
-
```
### Step 2: Configure the opensearch-py client
```python
-
from opensearchpy import OpenSearch
client = OpenSearch(**opensearch_api.get_default_py_config())
-
```
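
As a quick smoke test (a sketch, not part of the guide), the client can be asked for cluster info:

```python
# Optional sanity check (sketch): confirm the client can reach the cluster.
info = client.info()
print(info["version"]["distribution"], info["version"]["number"])
```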
### API Reference
diff --git a/docs/user_guides/projects/opensearch/knn.md b/docs/user_guides/projects/opensearch/knn.md
index 442eeb758..30f54e7f3 100644
--- a/docs/user_guides/projects/opensearch/knn.md
+++ b/docs/user_guides/projects/opensearch/knn.md
@@ -51,13 +51,8 @@ Create an index to use by calling `opensearch_api.get_project_index(..)`.
"knn.algo_param.ef_search": 100,
},
"mappings": {
- "properties": {
- "my_vector1": {
- "type": "knn_vector",
- "dimension": 2
- }
- }
- }
+ "properties": {"my_vector1": {"type": "knn_vector", "dimension": 2}}
+ },
}
response = client.indices.create(knn_index_name, body=index_body)
@@ -73,16 +68,17 @@ These vectors represent the list of vectors to calculate the similarity for.
=== "Python"
```python
- from opensearchpy.helpers import bulk
import random
+ from opensearchpy.helpers import bulk
+
actions = [
{
"_index": knn_index_name,
"_id": count,
"_source": {
"my_vector1": [random.uniform(0, 10), random.uniform(0, 10)],
- }
+ },
}
for count in range(0, 10)
]
@@ -103,24 +99,15 @@ Score the vector `[2.5, 3]` and find the 3 most similar vectors.
# Define the search request
query = {
"size": 3,
- "query": {
- "knn": {
- "my_vector1": {
- "vector": [2.5, 3],
- "k": 3
- }
- }
- }
+ "query": {"knn": {"my_vector1": {"vector": [2.5, 3], "k": 3}}},
}
# Perform the similarity search
- response = client.search(
- body = query,
- index = knn_index_name
- )
+ response = client.search(body=query, index=knn_index_name)
# Pretty print response
import pprint
+
pp = pprint.PrettyPrinter()
pp.pprint(response)
```
diff --git a/docs/user_guides/projects/python/custom_commands.md b/docs/user_guides/projects/python/custom_commands.md
index 892b5995a..9d9f151f3 100644
--- a/docs/user_guides/projects/python/custom_commands.md
+++ b/docs/user_guides/projects/python/custom_commands.md
@@ -44,12 +44,11 @@ You can also run the custom commands using the REST API.
From the REST API, you should provide the path in HopsFS to the bash script and the artifacts (a comma-separated string of HopsFS paths).
The REST API endpoint for running custom commands is: `hopsworks-api/api/project//python/environments//commands/custom`, and the body should look like this:
-```python
+```json
{
"commandsFile": "",
"artifacts": ""
}
-
```
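
For illustration, a hedged sketch of invoking this endpoint from Python; the host, project id, environment name, API key, and file paths below are all hypothetical placeholders:

```python
# All values below are hypothetical; the project id and environment name
# segments are elided in the endpoint above and depend on your cluster.
import requests

HOST = "https://my_instance.cloud.hopsworks.ai"
PROJECT_ID = 119      # hypothetical project id
ENV_NAME = "my_env"   # hypothetical environment name

response = requests.post(
    f"{HOST}/hopsworks-api/api/project/{PROJECT_ID}"
    f"/python/environments/{ENV_NAME}/commands/custom",
    headers={"Authorization": "ApiKey MY_API_KEY"},  # hypothetical key
    json={
        "commandsFile": "/Projects/my_project/Resources/commands.sh",
        "artifacts": "/Projects/my_project/Resources/my_artifact.whl",
    },
)
response.raise_for_status()
```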
## What to include in the bash script
From c9d22a7c660d0241f456d96faf7527540e655d34 Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Wed, 25 Feb 2026 13:30:19 +0100
Subject: [PATCH 09/16] Formatting
---
docs/user_guides/fs/feature_monitoring/index.md | 3 ++-
.../projects/scheduling/kube_scheduler.md | 16 ++++++++--------
2 files changed, 10 insertions(+), 9 deletions(-)
diff --git a/docs/user_guides/fs/feature_monitoring/index.md b/docs/user_guides/fs/feature_monitoring/index.md
index 698293831..49757e518 100644
--- a/docs/user_guides/fs/feature_monitoring/index.md
+++ b/docs/user_guides/fs/feature_monitoring/index.md
@@ -17,7 +17,8 @@ Hopsworks feature monitoring user interface is centered around two functionaliti
!!! important
To enable feature monitoring in Hopsworks, you need to set the `enable_feature_monitoring` [configuration option](../../../setup_installation/admin/variables.md) to `true`.
This can also be achieved in the cluster definition by setting the following attribute:
- ```
+
+ ```yaml
hopsworks:
enable_feature_monitoring: "true"
```
diff --git a/docs/user_guides/projects/scheduling/kube_scheduler.md b/docs/user_guides/projects/scheduling/kube_scheduler.md
index df352ddfa..fa1db20b9 100644
--- a/docs/user_guides/projects/scheduling/kube_scheduler.md
+++ b/docs/user_guides/projects/scheduling/kube_scheduler.md
@@ -92,18 +92,18 @@ This can be done from the `Available in Hopsworks` sub-section.
In order to be able to list all the Kubernetes Node Labels, Hopsworks requires the following cluster role:
- ```
- - apiGroups: [""]
- resources: ["nodes"]
- verbs: ["get", "list"]
+ ```yaml
+ - apiGroups: [""]
+ resources: ["nodes"]
+ verbs: ["get", "list"]
```
In order to be able to list all the Kubernetes Cluster Priority Classes, Hopsworks requires this cluster role:
- ```
- - apiGroups: ["scheduling.k8s.io"]
- resources: ["priorityclasses"]
- verbs: ["get", "list"]
+ ```yaml
+ - apiGroups: ["scheduling.k8s.io"]
+ resources: ["priorityclasses"]
+ verbs: ["get", "list"]
```
If the roles above are configured properly (default behaviour), admins can only select values from the drop down menu.
From b749adbc69d5990bde5214d59d24c99688358f5e Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Thu, 26 Feb 2026 15:02:31 +0100
Subject: [PATCH 10/16] Add breaks to TOCs
---
docs/css/custom.css | 6 ++++++
docs/templates/python/material/attribute.html.jinja | 10 ++++++++--
docs/templates/python/material/class.html.jinja | 10 ++++++++--
docs/templates/python/material/function.html.jinja | 10 ++++++++--
docs/templates/python/material/module.html.jinja | 10 ++++++++--
5 files changed, 38 insertions(+), 8 deletions(-)
diff --git a/docs/css/custom.css b/docs/css/custom.css
index 21076b2f8..785a61037 100644
--- a/docs/css/custom.css
+++ b/docs/css/custom.css
@@ -211,6 +211,12 @@ header.md-header {
content: "[source]";
}
+/*******************************************************/
+/* Handle overflow in sidebars. */
+.md-ellipsis {
+ overflow-wrap: break-word;
+}
+
/*******************************************************/
/* Custom styles for syntax highlighting in signatures. */
diff --git a/docs/templates/python/material/attribute.html.jinja b/docs/templates/python/material/attribute.html.jinja
index 76b3f95bf..116eea37a 100644
--- a/docs/templates/python/material/attribute.html.jinja
+++ b/docs/templates/python/material/attribute.html.jinja
@@ -33,13 +33,19 @@ Context:
{% set attribute_name = attribute.path if show_full_path else attribute.name %}
+ {% set label = config.toc_label if config.toc_label and root else attribute.name %}
+ {% if label | upper != label %}
+ {% set label = label.replace("A", "\u200bA").replace("B", "\u200bB").replace("C", "\u200bC").replace("D", "\u200bD").replace("E", "\u200bE").replace("F", "\u200bF").replace("G", "\u200bG").replace("H", "\u200bH").replace("I", "\u200bI").replace("J", "\u200bJ").replace("K", "\u200bK").replace("L", "\u200bL").replace("M", "\u200bM").replace("N", "\u200bN").replace("O", "\u200bO").replace("P", "\u200bP").replace("Q", "\u200bQ").replace("R", "\u200bR").replace("S", "\u200bS").replace("T", "\u200bT").replace("U", "\u200bU").replace("V", "\u200bV").replace("W", "\u200bW").replace("X", "\u200bX").replace("Y", "\u200bY").replace("Z", "\u200bZ") %}
+ {% endif %}
+ {% set label = label.replace("_", "\u200b_") %}
+
{% if not root or config.show_root_heading %}
{% filter heading(
heading_level,
role="data" if attribute.parent.kind.value == "module" else "attr",
id=html_id,
class="doc doc-heading",
- toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else attribute.name),
+ toc_label=(' '|safe if config.show_symbol_type_toc else '') + label,
skip_inventory=config.skip_local_inventory,
) %}
@@ -97,7 +103,7 @@ Context:
{% filter heading(heading_level,
role="data" if attribute.parent.kind.value == "module" else "attr",
id=html_id,
- toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else attribute_name),
+ toc_label=(' '|safe if config.show_symbol_type_toc else '') + label,
hidden=True,
skip_inventory=config.skip_local_inventory,
) %}
diff --git a/docs/templates/python/material/class.html.jinja b/docs/templates/python/material/class.html.jinja
index e0d0834c1..5ce769f0b 100644
--- a/docs/templates/python/material/class.html.jinja
+++ b/docs/templates/python/material/class.html.jinja
@@ -35,13 +35,19 @@ Context:
{% set class_name = class.path if show_full_path else class.name %}
+ {% set label = config.toc_label if config.toc_label and root else class.name %}
+ {% if label | upper != label %}
+ {% set label = label.replace("A", "\u200bA").replace("B", "\u200bB").replace("C", "\u200bC").replace("D", "\u200bD").replace("E", "\u200bE").replace("F", "\u200bF").replace("G", "\u200bG").replace("H", "\u200bH").replace("I", "\u200bI").replace("J", "\u200bJ").replace("K", "\u200bK").replace("L", "\u200bL").replace("M", "\u200bM").replace("N", "\u200bN").replace("O", "\u200bO").replace("P", "\u200bP").replace("Q", "\u200bQ").replace("R", "\u200bR").replace("S", "\u200bS").replace("T", "\u200bT").replace("U", "\u200bU").replace("V", "\u200bV").replace("W", "\u200bW").replace("X", "\u200bX").replace("Y", "\u200bY").replace("Z", "\u200bZ") %}
+ {% endif %}
+ {% set label = label.replace("_", "\u200b_") %}
+
{% if not root or config.show_root_heading %}
{% filter heading(
heading_level,
role="class",
id=html_id,
class="doc doc-heading",
- toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else class.name),
+ toc_label=(' '|safe if config.show_symbol_type_toc else '') + label,
skip_inventory=config.skip_local_inventory,
) %}
@@ -143,7 +149,7 @@ Context:
{% filter heading(heading_level,
role="class",
id=html_id,
- toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else class.name),
+ toc_label=(' '|safe if config.show_symbol_type_toc else '') + label,
hidden=True,
skip_inventory=config.skip_local_inventory,
) %}
diff --git a/docs/templates/python/material/function.html.jinja b/docs/templates/python/material/function.html.jinja
index 16eae44e5..b83a06adf 100644
--- a/docs/templates/python/material/function.html.jinja
+++ b/docs/templates/python/material/function.html.jinja
@@ -38,13 +38,19 @@ Context:
{% set symbol_type = "method" if function.parent.is_class else "function" %}
{#- Symbol type: method when parent is a class, function otherwise. -#}
+ {% set label = config.toc_label if config.toc_label and root else function.name %}
+ {% if label | upper != label %}
+ {% set label = label.replace("A", "\u200bA").replace("B", "\u200bB").replace("C", "\u200bC").replace("D", "\u200bD").replace("E", "\u200bE").replace("F", "\u200bF").replace("G", "\u200bG").replace("H", "\u200bH").replace("I", "\u200bI").replace("J", "\u200bJ").replace("K", "\u200bK").replace("L", "\u200bL").replace("M", "\u200bM").replace("N", "\u200bN").replace("O", "\u200bO").replace("P", "\u200bP").replace("Q", "\u200bQ").replace("R", "\u200bR").replace("S", "\u200bS").replace("T", "\u200bT").replace("U", "\u200bU").replace("V", "\u200bV").replace("W", "\u200bW").replace("X", "\u200bX").replace("Y", "\u200bY").replace("Z", "\u200bZ") %}
+ {% endif %}
+ {% set label = label.replace("_", "\u200b_") %}
+
{% if not root or config.show_root_heading %}
{% filter heading(
heading_level,
role="function",
id=html_id,
class="doc doc-heading",
- toc_label=((' ')|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else function.name),
+ toc_label=((' ')|safe if config.show_symbol_type_toc else '') + label,
skip_inventory=config.skip_local_inventory,
) %}
@@ -143,7 +149,7 @@ Context:
heading_level,
role="function",
id=html_id,
- toc_label=((' ')|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else function.name),
+ toc_label=((' ')|safe if config.show_symbol_type_toc else '') + label,
hidden=True,
skip_inventory=config.skip_local_inventory,
) %}
diff --git a/docs/templates/python/material/module.html.jinja b/docs/templates/python/material/module.html.jinja
index 4d7f0f571..39fc4be4e 100644
--- a/docs/templates/python/material/module.html.jinja
+++ b/docs/templates/python/material/module.html.jinja
@@ -32,13 +32,19 @@ Context:
{% set module_name = module.path if show_full_path else module.name %}
+ {% set label = config.toc_label if config.toc_label and root else module.name %}
+ {% if label | upper != label %}
+ {% set label = label.replace("A", "\u200bA").replace("B", "\u200bB").replace("C", "\u200bC").replace("D", "\u200bD").replace("E", "\u200bE").replace("F", "\u200bF").replace("G", "\u200bG").replace("H", "\u200bH").replace("I", "\u200bI").replace("J", "\u200bJ").replace("K", "\u200bK").replace("L", "\u200bL").replace("M", "\u200bM").replace("N", "\u200bN").replace("O", "\u200bO").replace("P", "\u200bP").replace("Q", "\u200bQ").replace("R", "\u200bR").replace("S", "\u200bS").replace("T", "\u200bT").replace("U", "\u200bU").replace("V", "\u200bV").replace("W", "\u200bW").replace("X", "\u200bX").replace("Y", "\u200bY").replace("Z", "\u200bZ") %}
+ {% endif %}
+ {% set label = label.replace("_", "\u200b_") %}
+
{% if not root or config.show_root_heading %}
{% filter heading(
heading_level,
role="module",
id=html_id,
class="doc doc-heading",
- toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else module.name),
+ toc_label=(' '|safe if config.show_symbol_type_toc else '') + label,
skip_inventory=config.skip_local_inventory,
) %}
@@ -80,7 +86,7 @@ Context:
{% filter heading(heading_level,
role="module",
id=html_id,
- toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else module.name),
+ toc_label=(' '|safe if config.show_symbol_type_toc else '') + label,
hidden=True,
skip_inventory=config.skip_local_inventory,
) %}
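As a plain-Python sketch, the label transformation in the three template hunks above works as follows (an equivalent reimplementation for readability, not code taken from the templates): a zero-width space (U+200B) is inserted before every capital letter of a mixed-case name and before every underscore, so long identifiers can wrap in the table of contents without changing how they are displayed.

    def toc_label(name: str) -> str:
        # All-caps names (e.g. constants) are left untouched, mirroring the
        # `label | upper != label` guard in the templates.
        if name.upper() != name:
            name = "".join("\u200b" + ch if "A" <= ch <= "Z" else ch for ch in name)
        # An underscore always gets a break point placed in front of it.
        return name.replace("_", "\u200b_")

    print(toc_label("ExternalFeatureGroup"))  # breaks before External, Feature, Group
    print(toc_label("get_feature_view"))      # breaks before each underscore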
From 42ae741db6a0150abbfd7fa2bdb19902d4b6336a Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Thu, 26 Feb 2026 15:04:18 +0100
Subject: [PATCH 11/16] Update apigen
---
requirements-docs.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements-docs.txt b/requirements-docs.txt
index 7947ecd52..089dda472 100644
--- a/requirements-docs.txt
+++ b/requirements-docs.txt
@@ -4,7 +4,7 @@ mike==2.1.3
markdown==3.9
pymdown-extensions==10.17.2
mkdocs-minify-plugin>=0.2.0
-hopsworks-apigen==1.0.2
+hopsworks-apigen==1.0.3
mkdocstrings[python]==1.0.3
mkdocstrings-python==2.0.2
mkdocs-autorefs==1.4.4
From f0665ea882441ebba94bf34199999afdabb24b81 Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Thu, 26 Feb 2026 16:58:49 +0100
Subject: [PATCH 12/16] Add GX inventory
---
build_great_expectations_inv.py | 169 ++++++++++++++++++++++++++++++++
docs/great_expectations.inv | Bin 0 -> 9362 bytes
mkdocs.yml | 2 +
3 files changed, 171 insertions(+)
create mode 100644 build_great_expectations_inv.py
create mode 100644 docs/great_expectations.inv
diff --git a/build_great_expectations_inv.py b/build_great_expectations_inv.py
new file mode 100644
index 000000000..8e9b7e33a
--- /dev/null
+++ b/build_great_expectations_inv.py
@@ -0,0 +1,169 @@
+# This file was generated by Claude Code
+
+"""Build docs/great_expectations.inv for GX 0.18.
+
+GX 0.18 uses Docusaurus, not Sphinx, so no objects.inv is published.
+Internally they still run sphinx-build as an intermediate step (to generate
+MDX from HTML), but delete the output including objects.inv.
+
+This script reconstructs it by:
+ 1. Cloning the 0.18.x branch (if not already present at GX_CLONE_PATH)
+ 2. Generating the MD autodoc stubs using GX's own build logic, capturing
+ the sidebar_entries mapping: Sphinx HTML stem -> Docusaurus URL path
+ 3. Running sphinx-build to get a properly typed Sphinx objects.inv
+ 4. Remapping each entry's URI from the Sphinx HTML path to the Docusaurus URL
+ 5. Writing the result to docs/great_expectations.inv
+
+Usage:
+ uv run --python 3.11 --with 'great_expectations==0.18.22' \
+ --with 'sphinx~=5.3.0' --with 'pydata-sphinx-theme==0.11.0' \
+ --with 'myst-parser' --with 'docstring-parser==0.15' \
+ --with 'sphobjinv' --with 'invoke' --with 'beautifulsoup4' \
+ build_great_expectations_inv.py
+
+Or with a pre-created venv (see GX_VENV below):
+ python build_great_expectations_inv.py
+"""
+
+from __future__ import annotations
+
+import pathlib
+import subprocess
+import sys
+import tempfile
+
+# --- Configuration ---
+
+REPO_ROOT = pathlib.Path(__file__).parent
+OUT_INV = REPO_ROOT / "docs" / "great_expectations.inv"
+
+# Where to clone/find the GX 0.18.x source
+GX_CLONE_PATH = pathlib.Path(tempfile.gettempdir()) / "gx_0_18"
+
+# Sphinx binary (defaults to whatever is on PATH)
+SPHINX_BUILD = "sphinx-build"
+
+# Base URL of the published Docusaurus API docs for 0.18
+DOCUSAURUS_BASE = "https://docs.greatexpectations.io/docs/0.18/reference/api/"
+
+
+# --- Step 1: Ensure GX 0.18.x source is available ---
+
+if not GX_CLONE_PATH.exists():
+ print(f"Cloning GX 0.18.x into {GX_CLONE_PATH} ...")
+ subprocess.run(
+ [
+ "git", "clone", "--depth", "1", "--branch", "0.18.x",
+ "https://github.com/great-expectations/great_expectations.git",
+ str(GX_CLONE_PATH),
+ ],
+ check=True,
+ )
+else:
+ print(f"Using existing clone at {GX_CLONE_PATH}")
+
+sys.path.insert(0, str(GX_CLONE_PATH))
+
+# --- Step 2: Generate stubs and capture sidebar_entries ---
+
+from docs.sphinx_api_docs_source.build_sphinx_api_docs import ( # noqa: E402
+ SphinxInvokeDocsBuilder,
+ SidebarEntryType,
+)
+
+import invoke # noqa: E402
+
+api_source = GX_CLONE_PATH / "docs" / "sphinx_api_docs_source"
+ctx = invoke.Context()
+builder = SphinxInvokeDocsBuilder(
+ ctx=ctx,
+ api_docs_source_path=api_source,
+ repo_root=GX_CLONE_PATH,
+)
+
+print("Generating autodoc stubs ...")
+builder._build_class_md_stubs()
+builder._build_module_md_stubs()
+
+# Build the mapping: Sphinx HTML stem -> (py_domain_type, docusaurus_relative_url)
+stem_to_info: dict[str, tuple[str, str]] = {}
+
+for name, entry in builder.sidebar_entries.items():
+ doc_url = str(entry.mdx_relpath.with_suffix(""))
+ if entry.type == SidebarEntryType.CLASS:
+ stem_to_info[name] = ("py:class", doc_url)
+ else:
+ # Module: key is the flat path string, stem is the md_relpath stem
+ stem = entry.md_relpath.stem
+ stem_to_info[stem] = ("py:module", doc_url)
+
+print(f" {len(stem_to_info)} sidebar entries captured")
+
+# --- Step 3: Run sphinx-build ---
+
+sphinx_out = GX_CLONE_PATH / "temp_inv_build"
+sphinx_out.mkdir(exist_ok=True)
+
+print("Running sphinx-build ...")
+subprocess.run(
+ [SPHINX_BUILD, "-M", "html", str(api_source), str(sphinx_out), "-E", "-q"],
+ check=True,
+)
+inv_path = sphinx_out / "html" / "objects.inv"
+print(f" Sphinx objects.inv: {inv_path.stat().st_size} bytes")
+
+# --- Step 4: Remap entries ---
+
+import sphobjinv as soi # noqa: E402
+
+sphinx_inv = soi.Inventory(str(inv_path))
+print(f" Sphinx inventory: {len(sphinx_inv.objects)} objects")
+
+remapped: list[soi.DataObjStr] = []
+skipped = 0
+
+for obj in sphinx_inv.objects:
+ uri_raw = obj.uri
+ uri_path, _, fragment = uri_raw.partition("#")
+ stem = pathlib.Path(uri_path).stem
+
+ if stem not in stem_to_info:
+ skipped += 1
+ continue
+
+ _, doc_url = stem_to_info[stem]
+
+ # "$" means "use the object name as the anchor"
+ resolved_fragment = obj.name if fragment == "$" else fragment
+ full_uri = f"{doc_url}#{resolved_fragment}" if resolved_fragment else doc_url
+
+ remapped.append(
+ soi.DataObjStr(
+ name=obj.name,
+ domain=obj.domain,
+ role=obj.role,
+ priority=str(obj.priority),
+ uri=full_uri,
+ dispname=obj.dispname or "-",
+ )
+ )
+
+print(f" Remapped: {len(remapped)}, skipped (index/search pages): {skipped}")
+
+# --- Step 5: Write inventory ---
+
+new_inv = soi.Inventory()
+new_inv.project = "great_expectations"
+new_inv.version = "0.18"
+new_inv.objects = remapped
+
+soi.writebytes(str(OUT_INV), soi.compress(new_inv.data_file()))
+print(f"Written {len(remapped)} entries to {OUT_INV}")
+
+# --- Cleanup ---
+
+builder._remove_md_stubs()
+import shutil # noqa: E402
+shutil.rmtree(sphinx_out, ignore_errors=True)
+
+print("Done.")
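Once the script has run, the rebuilt inventory can be spot-checked with the same sphobjinv API (a sketch; the exact entry names depend on the GX build):

    import sphobjinv as soi

    inv = soi.Inventory("docs/great_expectations.inv")
    classes = [o for o in inv.objects if o.role == "class"]
    print(f"{len(classes)} class entries in {inv.project} {inv.version}")

    # Remapped URIs are relative to the base_url configured in mkdocs.yml and
    # should point into the Docusaurus tree, not at a Sphinx *.html stub.
    assert not any(o.uri.endswith(".html") for o in classes)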
diff --git a/docs/great_expectations.inv b/docs/great_expectations.inv
new file mode 100644
index 0000000000000000000000000000000000000000..7b2474cf6453fdc1b74efa21d32a7df510225545
GIT binary patch
literal 9362
zcmV;DByHOxAX9K?X>NERX>N99Zgg*Qc_4OWa&u{KZXhxWBOp+6Z)#;@bUGksa%Ew3
zUuAf3Wn*+V>iATTa5I0_>mRA^-&a%F8{X>Md?av*PJAarPH
zb0B7EY-J#6b0A}HZE$jBb8}^6Aa!$TZf78RY-wUH3V7P(T}^Z2wzA#lSER~oO7$wc
z>}-!`o~yAc^l)3;bWce^aojLn~_Pv`aD`Yk`ORlJRhINhrHoLcpJ-@NVEqKNqJ?DTOy
z@BA|7JN3~X*!Mll3yI!UvS-_3-fem8e*MfAFS|6}_Bt<)pUzJW1bs^Pui~}zN)_)r
zmY3eh*_`hq0DEBhK9SLApiUozS>1ZhcR%4K>3(T4ocM+(gfvN_UC!#y`~Jz;cFD3i
z*P`2Vzu8`{*BM(!1%qB|d3TwGJ62@z;yz1XXBdm
zm!#`<5^PJ8X(ov!hpE#vn!_-qRQ@f0B(Oe!ks17O_3_c(-T*~r?qi&=`>1%fHz`C?
zna$txf^BFF3$SFSKV1{}mj{s!hss$Xy|r($2Mv9jy0p|-!Zac8EHBML6p!yra~GJu`y&Cq{a}NvKW$j9Jxt!
z`eTA5Hj7Jrln`P=sI*6gATfhVd6W=GL-689UfwhxOK)7SO&l+&wYAft+j1`G%^`nC
z%Oqg?wTdx=`$Y@6teb>Q92)fE@ILOhbwZ&;T(ymQn7y7H>V1?Iag=nMJnmPk_zh@7
zfupx*ZE9e{xHjhMV%b=U7g?I8tKut*Hu;ydh*$BVhEk@1x^5l(GWp)FZxUAB{BN=>
z)l>hzXW9Fpuoobmtdw`Nc!6D2k@lA06Rc9G0FPGx*yU@sjoD>c-W1BqA!vC-xsTQ?
zTkeZ@J&dSn2WPvC^7@8)m84HqK2bI|tyXlsIWvcQsX27N2M?!%;+t{(A$5tc2Zz_c
zjhc%T_2ZyFrRh3h^-?bC5~=!grkb0kIdr%E2`bHtby=(%Lk-$@pxQFx&7|}=wdMit
zSd^FJ6adop;DNqIPYDF3Zhr*%vkHTC)}Z!C{yhmhpPst)k)TnL?{q|aU~BeP?HnyZ
z>)2V=)XY}hB;7i&+uLQyvdjoKJ&&-DzhNtJanBR*mq+9DOW5{=WWI36#xCm2nXj@1NJlH*cBQ6z@O?-%<3QB(gp;%ExL8)2
zJLExKtTtPk@AHa4r|~I|lEED9oQ95)E8k4ezZrVstW1!*l(kkfJL6-Id@e`h4fd0+
z)7sHWfY;(q_7J7l|=iOD?a{-F=lj{QT{6CEC6;-`Uj@@N5c4aB?z>s|-0
zrGu8uqky|+$rH7RCIyrI0nBq3#n~ym5^xm*s{Sb947|jAd*uZzFLIn0{lU-I(oKWI
z55JTHBPzN>JD;6`(ya4nKxRE{HMXXHBZik8@SiWRL3j1?3&mHUVZnmJIiDkX8DYE|k2
zX9>wr=}3B-X3o?sDoC7avZ!!jXW5du(%J6mn>)Y1=A!VoV9n(Mj1Q-zbtDBZb9YIW
zmlS?eEiYZ*En2YEy3`G2BPR=2fW+RFuK-;jY+#6~97)Q~$aRCoD6!kbi%}OqY$%yo
zsPGjDrN#o7mvdSstfEEpk+3KYyj7(K8s=qTFu}U;ph8($1*4T#2YTunCK$1ZIWW|e
z3c;wg!jYsd9S=s&$B$G^ligr)*=|pQBxMXn${Bmybop5@nFQ@XTb=O)Bc?nDhVsNB
z7$LLJEMW(p*0Ae+xY^*1|JmTc0C5%uJ22pahzpgdSrX?xlrXPfliQY2C6}dBqexaW
zrS-?_aVWR#y1TOz^op9VW8a*GLkPW4q~Sk#3A~IK#k8dNn2Gh4ce$DL_1kWM)4zUu
z0mg?@nypz;&hy2B<$3t>Bg)p?XmdbQWD~($K1^J2w!HMScET|COplg~ST=8UD+Xrg
zt?mnBJaf~tXu0e?Q=;+@hReVJfZHnG_WI`Z?1m-){4%ylUzuC?32Xou=FjPmUc2z=
zz58}0JzuRw0DI#QgJKE~DdK|^bdy(n@WBPz9-b=Dp~S=y4u|*}&lcxlymRT8n!2?A
z8V&D6!D}=ZW=Mt>HLgM1HK{2v5duGvcC>Le1!7?+e_BPHu
z+|!+79Th(3k9E9&Ap@D1yT*LU4Fd@KYsgi{q^>bFW?L8-3=nr^q#r?x1Yb>JluQ;ALQul+Zp+zBkD>$)pGE1O`8Dev4#uZB>chb*d?jxffj0Z
zwG|<|?y+ez!Pl2|uc+BqonBz9pt#i!!Y+F60Uh1wjw^2!PrYumOc!~W$J}$&j_Grj
z?Ig4(i+ZNtTE$ZzJT&4{!O%EfNScNF`W3&C=7P{Zg`9B)m(V_ioKXjyiUwX>UcCeB
zQ$}MDNBC(UC!Nsei#Bf-M}El?Hp&bi8nzNJ&OJ|T+**j=;50OBC1A90B(-lP$7?9+
z*~HPVUKxyDqd2LLoj!b5Zu#NxJ!*$SG%WDa>L-xH>dji8QaV=g)cwa;&MKa||DeQK
z#Zw8y$P
zuiwPRt%dkam8e#g#$HY$UPo%-c3JvQ*~|0n78}P2d5zhR;cIfflt`@WrPN<>X0qeM
zH#8(Wz=u-;?UoSH#O;$vQFNh(nzeC+5h4I$3ng((ND%_TwnQi;AU8uaSiVOPat}m<
zHPWcE+xiefY8D$Q*_*C04mulecTll~Z*&YG7F7oij&1EA7y|ZTsK{b=U}PY^Z=%?Z
z7Zs4NK#19Ik%5d5`Yg6tWLTcU9&mSM&>y?CLdD;DQ$>hqvZF$&0JN7PMDQ6T!CNRo
z1fSsrymKN%FxYLNBn;VS7y!Ul>8WTV*XQLp+QoTt=H|=t0}Ru}Ybws*W$OU~bj_NI
zIeN8vfMmW@O~oCtNIk$9p%gr~tXSQKkl$b0ziy9j;7TUqlGk(v2m@fQH11XAWbabHr93quSx)|w7fpMUe
zq5vYL3lW%bDFuTqu)C0hTxd7I8*7vy7u^l;Mjm-~3-D4{&1#ejC8kSKLIC3hd1~hH
zg?S-N`Nvn56=9Kvt2kkY#{P832q5(Umh!2UV3zb%Kx5tBvCI29FyG-9v(&Ew8ZqEn
zEc>hQJO^9g7Jv`~yA43C#d(Ye0%JM&Jc+6G&0U2vguK11ZXg0Sf!R
zg%r$S3&e#6$X19D#QdELYWj$`F621I8#+!@z&Cc(SlZ1UXDZA$dE_|YmoJ>C!M}bX
z#5CQ)qSgZ5$`YbLx3j2KptrSzXy#j6)H)DbTtb`?m#xUwtAJZxjseeIqWqk*&6k
zzsyFxz6RVDb0i16JthQ$Zjn)oL2s3Pj1y+EOUAV`ZZfL2Y{0-X^iTND3T3F%PqCB*E|=
z0r-oD;JqFZfWKe@-rNxZcn>4k
z-5e2)!A1@pX~^~s-`{-ahK4UHV^(-ku5#Pu&VeJ9ex0
z3p;FOg@D($g3;gK>itqbC2qE*7L2r~)_h0QQR2K_|F`p<)O}AL|?WiR3<{3%h5n;p-iP7XkcbnrT0Gc3W~0<=F@q<
zk(#d<7e)iRTu((4y-?qFZzJKh{9~F)eA#Pb#IcVG;(&CdB}WQ3D#s)6A_K;u^%z7`
zvvt~iv|UEI)^&U{BbEbf9hpPmYR5y97VI;&ysQ*m{{8qV**6Ju`CpQ*O(sqsa}L8a
zl4-}c=3eWAyHL%_fhd|KP-eu1P&21bfDVF5b^32_d}&w5TF?>0xTlOh*&z&?rg|-
z^^MzM^XZ!4p2LF>IAg$zR;H7o;lT}V2gBoU_G5v=An*atid3VMf#BQ+j)%ebgkT(S
zAQVpEdC|&rGZfsrfp{=jPb;gj9MWB
z0u?kD*QHA0OSw3%U$!XGk33u_`@_o+%{oO$^mJD6Ge4d+bY)f5hxjLsg7bgmXyB9O
z(<%?q&)Nj%{;;Ze3AbaA~(~6bKXrK`r&&LVz-ln^WIP9el-&5`aKJdlQi_(#QJG%^Tusa0?93L
zTcW|wx+luJP?S>|Z}kP|z1PS2LX=CIZgd6bywk<`K^jOP+daY0u-_waj5v3M?Q{e`
z%T@>HA3@G&xUL_Z@4`OkgJG^{wz3_Z>(Vx_f0!d0uI~otyTF?@3u_^0T{D>M!se>X
zH@3y>sycAEl8Tw!{%&y^VbbMV2r9hn&od7*RjC4xx2x{)n>1ogAAI
zM~Vk6bR5e=7d=u$cBPkxNJaQ}!St`l#UdOTCrAFfWxhD;Zt{PQQ%
zrX1AbLd!zWve9Lt78zbHdX{ccKI*YSWu$N6xUy1<3o9=@i$<24dR$o9>05QNaWlK&
z<)~-n7UiiH8(5}#R*o%OwZO3Q)w613IjhBmm$jaSTa>waY-qXbTRXb^)k4F{V9&ad
zWw9O?Rv!CSjVzb-z`(NEw{C11twnyDa_)jGubX25%**L+d$@|8H8f8|c}z#!ru-e?
zK+9cX8R&9$fCMjJiKSSSr$a1IIZ7=AS8fh)VC5yT2xR#<#DSHI)GCS%&Fp}ef5b{G
z%DVv;u$&`Sf-TnuK(O+RSOv2D8sNaoEn*=S<21t=A-udu^P+5+8=|F0&LBw|9yD@5i(~%O;g-!zK$k2r%RB4%^^in0CzD{VC63@+v
z%}59Ha7wceIp^?)%4oZTwNV@?qXmK|>)te4B=1^SV7lj5GdAhd1LKl=&+09ceSm>!
zj{@r=0AK=^QjFyYWU>LU07^f`asw_g0T=+MDr0#9pTq!+z|xwrT;U+$K@=N))w_xZ
z6OICo=KjQIyu*1u<=Ks#)8Pi~H4HS_C
z)2gAGPb1DEqPa8jj3JySW6lo5ocNx^h514_*BZo_w}fN}eZ88HGWx}85f0$3YHA6H
z_o<(vV(}bitW=*Dv#aX2hvt{lhzUgGyG$tr(gagFVRhFY
z-P0qlIKC;E()msK;h>N%d*!Jy!L*Kz$*c5D6V{E7FbU4%43kDPFAyfk>!au`VeTHJ
ze8n@E);pe!0`dLF-~shux=*M#%5?W0z;i#r`8@d3Xz$;56q^IVban^0to+Zal)5}r
zK=QJ)-)(;C1|coCZA-pAy0R>^3YLY7G}&*q;cHa((iC=Xbt!1kDlVI_BHplS6{VPv
z}^KIn`{R_eqg$
z^FFt_$?`s#1$f@)RIo+fr-H=hea^L7=Y3}2xV+D!dds}8W5hb|qgHK~_r*njp;P)8
zM=<(x99#CIOZ6(s4T$zz8S&$Qfwc6LfOG&<4!hlhjczR`zMT=X0l1JKdt-}z^<&7k
z>v+p%H-`^y>fgJmJKuKBwUyJ-T0wD}t*cp@Zg%@>nxva3-u5QDJH8^zcnJ+a86FmC
zw#;4GU-Xa*K!oCtdWsEaLHumz7CCkpVRNDc=XOYTIW##OTe`Vlg)#9aUqBsOgb
zgU6O55%|%iz=I4gm%4BR?0a&VES}ls9W=&dy&y`YmMxjLO9cTIdd?9q`56SplMt%u{^GR9A>!+
z!D&M_?PCH{+rVW{k!X_)4cP`obROKZEDw!_N^;JlXzTli0^9Zu;>!nQcNK*V
z)cwVm1MKcO3SijX$Cq5>9jQUYg=h*bwgH(Y4ihV}j=FTgmRcE6Xr+&+)Iy2lCsEj>
zR!I>gZDS*~CaXwz7wvGtu-_3agtKoi!6GU;kz^ekIXJuU=td$@kmOKl;-nkJ#L<#N
zw+NVSZjcZv_@CVy;{16tPzTi;2H(g(BlHH1yiB#|K!7KtsmeKAccS5IR|8al$$@&}p&@
z0Qfmir@($@TWuWC3$fZRx~tw{T41YhX>WuyITIp9sN~U$LW}fKj)@KHeiU1Wd^xo3
z4Zx2B$ZigD6R2B-AN6*(5jigGrs78{@`h6%P5=$ad?$pEfn7V=sHp2ehDD4WKw(3x
z3t85Ibbx3ZKV7IKi$o>Sw2gZ%RFcKL0WvD?xsYfV_XcpVxaUF;QQWgyWpJqryxvgK&B`mw
zKR|U?e|(>%)rmQoG|acX=gQuO`plR>6_xouO3R=hO2yE+Ie$2FTh2CY+mm*p!(|Jb
zKZLs$%5;DQXEee#(M}?>-LwE+KLOrcfd^C#Woze$e#uSX2o)fKCzO2cT+vPqx&IXo
z3ML+;6nb~%s0z_jB=~tBZn7-Z>k}f)1xP@t<_yhMH(w9NW7b(Y@Tv-t<^m+3YIBBq)n%H*
z-omFSXUniNue$}wL)D}@UTg_^T3C-fS0fvG#a@92FBz{&osHLk>s$8rAd6t<<3-eR
zn!Q=fn%ns})oS(5-n{V=-}!hHzF25)Q@^5kKK=u&P0|@NUBEm)H-eW!O-(}=7EN7e
zSJT_F*ZZeB*H?-E$O?E0s6(BrPz`-!$K0`JjMKi!dkUED+1Y}mdeiqDD#f!iiAnFK
z??|h(&d#)PN;iEk!c#drd-^nP`aT1sZzhJIS%!&&>@o}+)>5Bjakc&yCEe54Ki1y|
zf1mzf+10J+g!e}Jq3gVJ+PkEyS|Nl3g^>E|9Cp>GPjR@6-oohcMNhr4KZZUDHFDJ(
z7Uk79J5N-3pB}mTW^s0++PmaH%8#&bCNuTa8ZM(_3B&NNxw-(BZ4WflgGCJ6dd(id
zwQi4Q+E@eeT-Ae0AGL#X3EQn8(RdD~HK6-U2je=J=D_Z==Eiq0-T>caY4t_$vwdEF
zD`2~NR2ggklFh!Af4c7c<-YwxcYd~w=W>|23`VZXh8wlm_KTVFeavO-Q&Zj}D)d6e
zw(T<>NwmvZ2~v~;r+YP<7IUtJoUQ~BV0~7k;Xip=RbY#Len7*ekFO&3Q2)c0k7X{u
zU4LbpT~Zb}awjR5w(t)pdB%d$`-om|_nbO2*iY*3`_MR(-`nx-Glp
z@);a>nZg|=hLa{3ebxv#i$`DJdH6N|bbn|eGQo^Ev^o87Xsjisup_3oFK;@{3dr%+
z^H6?PuL&LQIS(w~C;h2A>^7HlQ&0xjwk2P$Mt3?3?)1JjNhnz>l?OH70u6$m0IkYz
zt6DwIi>RE6S%Xy-K9`3N%?6|pXf6PPsWs|McD)w}%bwX}ifxh6DeBZat&BiSqV@xM>CFe%BIwcEHu9
zkik@P$aae}%?|Gm^d7NlUn02XA1FJd9mn2tQSZ5+w_FN>36UcCyd77c!3;fto__l4
z@RRsToqjqtG$Ad*hf`|bB+DgUrsH-rgy5)8GMz96B#C;UW{mkE-P%GheP+?pJ;&M#
zf?dWo=_~8@aL-j>6Tv)>`WU{3u*5Dxf$;GmC!9ME`8110;l7BIU-M%5`!Zd0R?6y+
MtN;D-e
From: Aleksey Veresov
Date: Thu, 26 Feb 2026 17:14:39 +0100
Subject: [PATCH 13/16] Add polars patch inventory
---
build_polars_patch_inv.py | 50 +++++++++++++++++++++++++++++++++++++++
docs/polars_patch.inv | 5 ++++
mkdocs.yml | 4 +++-
3 files changed, 58 insertions(+), 1 deletion(-)
create mode 100644 build_polars_patch_inv.py
create mode 100644 docs/polars_patch.inv
diff --git a/build_polars_patch_inv.py b/build_polars_patch_inv.py
new file mode 100644
index 000000000..dab60e2e0
--- /dev/null
+++ b/build_polars_patch_inv.py
@@ -0,0 +1,50 @@
+# This file was generated by Claude Code
+
+"""Build docs/polars_patch.inv — a patch for polars' broken objects.inv.
+
+The polars Sphinx inventory is missing py:class entries for all major classes
+(DataFrame, LazyFrame, Series, Expr, Config, DataType). Their docs build
+generates per-method pages and never emits the class-level entry.
+
+This script constructs a small supplemental inventory with just those six
+entries, each pointing to the relevant class overview page.
+
+Usage:
+ uv run --with sphobjinv build_polars_patch_inv.py
+"""
+
+import pathlib
+import sphobjinv as soi
+
+OUT = pathlib.Path(__file__).parent / "docs" / "polars_patch.inv"
+
+# Mapping: fully-qualified name -> relative URL (from docs.pola.rs/api/python/stable/)
+# $ means "use the object name as the anchor" (standard Sphinx convention)
+MISSING_CLASSES = {
+ "polars.Config": "reference/config.html#$",
+ "polars.DataFrame": "reference/dataframe/index.html#$",
+ "polars.DataType": "reference/datatypes.html#$",
+ "polars.Expr": "reference/expressions/index.html#$",
+ "polars.LazyFrame": "reference/lazyframe/index.html#$",
+ "polars.Series": "reference/series/index.html#$",
+}
+
+objects = [
+ soi.DataObjStr(
+ name=name,
+ domain="py",
+ role="class",
+ priority="1",
+ uri=uri,
+ dispname="-",
+ )
+ for name, uri in MISSING_CLASSES.items()
+]
+
+inv = soi.Inventory()
+inv.project = "polars-patch"
+inv.version = ""
+inv.objects = objects
+
+soi.writebytes(str(OUT), soi.compress(inv.data_file()))
+print(f"Written {len(objects)} entries to {OUT}")
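The "#$" suffix used in the URIs above is the standard Sphinx inventory shorthand for "the anchor is the object's own name"; intersphinx and mkdocstrings expand it at resolution time roughly like this (a sketch of the convention, not code from this repository):

    def resolve(base_url: str, uri: str, name: str) -> str:
        # A trailing "$" in an inventory URI stands for the object's name.
        return base_url + uri.replace("$", name)

    print(resolve(
        "https://docs.pola.rs/api/python/stable/",
        "reference/dataframe/index.html#$",
        "polars.DataFrame",
    ))
    # https://docs.pola.rs/api/python/stable/reference/dataframe/index.html#polars.DataFrame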
diff --git a/docs/polars_patch.inv b/docs/polars_patch.inv
new file mode 100644
index 000000000..3092aad1b
--- /dev/null
+++ b/docs/polars_patch.inv
@@ -0,0 +1,5 @@
+# Sphinx inventory version 2
+# Project: polars-patch
+# Version:
+# The remainder of this file is compressed using zlib.
+[binary data: zlib-compressed inventory payload, not representable as text]
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index b68b22c32..e5c98e094 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -372,8 +372,10 @@ plugins:
- https://docs.pydantic.dev/latest/objects.inv
- https://fastapi.tiangolo.com/objects.inv
- https://scikit-learn.org/stable/objects.inv
- - https://docs.pola.rs/api/python/stable/objects.inv
- https://arrow.apache.org/docs/objects.inv
+ - https://docs.pola.rs/api/python/stable/objects.inv
+ - url: file:./docs/polars_patch.inv
+ base_url: https://docs.pola.rs/api/python/stable/
- url: file:./docs/great_expectations.inv
base_url: https://docs.greatexpectations.io/docs/0.18/reference/api/
From 314ffb8613471e10825111af68e1daff2060fcb2 Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Fri, 27 Feb 2026 11:24:11 +0100
Subject: [PATCH 14/16] Check for mkdocs warnings
---
.github/workflows/mkdocs-test.yml | 3 +++
mkdocs.yml | 4 ++++
2 files changed, 7 insertions(+)
diff --git a/.github/workflows/mkdocs-test.yml b/.github/workflows/mkdocs-test.yml
index c86687a2d..15f42bedf 100644
--- a/.github/workflows/mkdocs-test.yml
+++ b/.github/workflows/mkdocs-test.yml
@@ -61,6 +61,9 @@ jobs:
- name: Install Ubuntu dependencies
run: sudo apt update && sudo apt-get install -y libxml2-dev libxslt-dev
+ - name: Check mkdocs warnings
+ run: mkdocs build -s
+
- name: Check for broken links
run: |
# run the server
diff --git a/mkdocs.yml b/mkdocs.yml
index e5c98e094..30e798a8e 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -364,6 +364,10 @@ plugins:
link_source: true
extensions:
- hopsworks_apigen.mkdocs
+ docstring_style: google
+ docstring_options:
+ ignore_init_summary: false
+ merge_init_into_class: false
inventories:
- https://docs.python.org/3/objects.inv
- https://pandas.pydata.org/docs/objects.inv
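With docstring_style set to google and the two parser options spelled out above, docstrings are expected to take roughly the following shape to build cleanly under "mkdocs build -s" (a hypothetical illustration; the class and parameters are not part of the Hopsworks API):

    class FeatureStoreClient:
        """Entry point for interacting with a feature store."""

        def __init__(self, host: str, project: str | None = None) -> None:
            """Create a client bound to one project.

            With merge_init_into_class set to false this docstring stays on
            __init__ instead of being folded into the class docstring, and with
            ignore_init_summary set to false its summary line is rendered.

            Args:
                host: Hostname of the Hopsworks cluster.
                project: Project name; defaults to the caller's current project.
            """
            self.host = host
            self.project = project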
From 5a2a52e023a52aa6d7048361f8c1858fbed73c7e Mon Sep 17 00:00:00 2001
From: Aleksey Veresov
Date: Fri, 27 Feb 2026 12:14:54 +0100
Subject: [PATCH 15/16] Format examples
---
.github/workflows/mkdocs-test.yml | 28 +-
docs/assets/images/architecture.svg | 2842 ++++++++---------
.../projects/python/python_env_clone.md | 120 +-
.../projects/python/python_env_overview.md | 132 +-
4 files changed, 1565 insertions(+), 1557 deletions(-)
diff --git a/.github/workflows/mkdocs-test.yml b/.github/workflows/mkdocs-test.yml
index 15f42bedf..66b1f5726 100644
--- a/.github/workflows/mkdocs-test.yml
+++ b/.github/workflows/mkdocs-test.yml
@@ -24,6 +24,24 @@ jobs:
with:
globs: '**/*.md'
+ - uses: actions/setup-python@v5
+ with:
+ python-version: "3.10"
+
+ - name: Install uv
+ uses: astral-sh/setup-uv@v7
+ with:
+ activate-environment: true
+ working-directory: hopsworks-api/python
+
+ - name: Snakeoil (Python code blocks in markdown)
+ run: |
+ uv tool install md-snakeoil
+ snakeoil --line-length 88 --rules "E,F,B,C4,ISC,PIE,PYI,Q,RSE,RET,SIM,TC,I,W,D2,D3,D4,INP,UP,FA" docs
+ # Remove newlines added at the end of code blocks by snakeoil:
+ python3 -c 'import re,pathlib;sn=["python","py","Python","python3","py3"];inf="|".join(sn)+"| "+"| ".join(sn);p=rf"([ \t]*)(\`{{3}}(?:{inf})(?:[^\n]*)\n)(.*?)([ \t]*\`{{3}})";[f.write_text(re.sub(p,lambda m:m.group(1)+m.group(2)+(m.group(3)[:-1] if m.group(3).endswith("\n\n") else m.group(3))+m.group(4),f.read_text(),flags=re.DOTALL)) for f in pathlib.Path("docs").rglob("*.md")]'
+ git diff --exit-code
+
- name: Cache local Maven repository
uses: actions/cache@v4
with:
@@ -42,16 +60,6 @@ jobs:
working-directory: hopsworks-api/java
run: mvn clean install javadoc:javadoc javadoc:aggregate -DskipTests && cp -r target/site/apidocs ../../docs/javadoc
- - uses: actions/setup-python@v5
- with:
- python-version: "3.10"
-
- - name: Install uv
- uses: astral-sh/setup-uv@v7
- with:
- activate-environment: true
- working-directory: hopsworks-api/python
-
- name: Install Python API dependencies
run: uv sync --extra dev --project hopsworks-api/python
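The newline-trimming one-liner in the Snakeoil step above is dense; expanded into ordinary Python it reads as follows (same pattern and behaviour, restructured only for readability):

    import pathlib
    import re

    names = ["python", "py", "Python", "python3", "py3"]
    info = "|".join(names) + "| " + "| ".join(names)
    pattern = rf"([ \t]*)(\`{{3}}(?:{info})(?:[^\n]*)\n)(.*?)([ \t]*\`{{3}})"

    def trim(match: re.Match) -> str:
        body = match.group(3)
        # md-snakeoil leaves an extra blank line before the closing fence;
        # drop one trailing newline when the block body ends with two.
        if body.endswith("\n\n"):
            body = body[:-1]
        return match.group(1) + match.group(2) + body + match.group(4)

    for path in pathlib.Path("docs").rglob("*.md"):
        path.write_text(re.sub(pattern, trim, path.read_text(), flags=re.DOTALL))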
diff --git a/docs/assets/images/architecture.svg b/docs/assets/images/architecture.svg
index 573beb6d7..20946b8d7 100644
--- a/docs/assets/images/architecture.svg
+++ b/docs/assets/images/architecture.svg
@@ -1,1421 +1,1421 @@
[1421 removed and 1421 added lines of SVG markup omitted]
diff --git a/docs/user_guides/projects/python/python_env_clone.md b/docs/user_guides/projects/python/python_env_clone.md
index 3ad5b1965..dd47e2684 100644
--- a/docs/user_guides/projects/python/python_env_clone.md
+++ b/docs/user_guides/projects/python/python_env_clone.md
@@ -1,60 +1,60 @@
-# How To Clone Python Environment
-
-## Introduction
-
-Cloning an environment in Hopsworks means creating a copy of one of the base environments.
-The base environments are immutable, meaning that it is required to clone an environment before you can make any change to it, such as installing your own libraries.
-This ensures that the project maintains a set of stable environments that are tested with the capabilities of the platform, meanwhile through cloning, allowing users to further customize an environment without affecting the base environments.
-
-In this guide, you will learn how to clone an environment.
-
-## Step 1: Select an environment
-
-Under the `Project settings` section you can find the `Python environment` setting.
-
-First select an environment, for example the `python-feature-pipeline`.
-
-