diff --git a/.github/workflows/mkdocs-test.yml b/.github/workflows/mkdocs-test.yml index c86687a2d3..96a67eb245 100644 --- a/.github/workflows/mkdocs-test.yml +++ b/.github/workflows/mkdocs-test.yml @@ -24,24 +24,6 @@ jobs: with: globs: '**/*.md' - - name: Cache local Maven repository - uses: actions/cache@v4 - with: - path: ~/.m2/repository - key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} - restore-keys: | - ${{ runner.os }}-maven- - - - name: Set up JDK 8 - uses: actions/setup-java@v5 - with: - java-version: "8" - distribution: "adopt" - - - name: Build javadoc documentation - working-directory: hopsworks-api/java - run: mvn clean install javadoc:javadoc javadoc:aggregate -DskipTests && cp -r target/site/apidocs ../../docs/javadoc - - uses: actions/setup-python@v5 with: python-version: "3.10" @@ -52,6 +34,14 @@ jobs: activate-environment: true working-directory: hopsworks-api/python + - name: Snakeoil (Python code blocks in markdown) + run: | + uv tool install md-snakeoil + snakeoil --line-length 88 --rules "E,F,B,C4,ISC,PIE,PYI,Q,RSE,RET,SIM,TC,I,W,D2,D3,D4,INP,UP,FA" docs + # Remove newlines added at the end of code blocks by snakeoil: + python3 -c 'import re,pathlib;sn=["python","py","Python","python3","py3"];inf="|".join(sn)+"| "+"| ".join(sn);p=rf"([ \t]*)(\`{{3}}(?:{inf})(?:[^\n]*)\n)(.*?)([ \t]*\`{{3}})";[f.write_text(re.sub(p,lambda m:m.group(1)+m.group(2)+(m.group(3)[:-1] if m.group(3).endswith("\n\n") else m.group(3))+m.group(4),f.read_text(),flags=re.DOTALL)) for f in pathlib.Path("docs").rglob("*.md")]' + git diff --exit-code + - name: Install Python API dependencies run: uv sync --extra dev --project hopsworks-api/python @@ -61,6 +51,27 @@ jobs: - name: Install Ubuntu dependencies run: sudo apt update && sudo apt-get install -y libxml2-dev libxslt-dev + - name: Check mkdocs warnings + run: touch docs/javadoc; mkdocs build -s; rm docs/javadoc + + - name: Cache local Maven repository + uses: actions/cache@v4 + with: + path: 
~/.m2/repository + key: ${{ runner.os }}-maven-${{ hashFiles('java/pom.xml') }} + restore-keys: | + ${{ runner.os }}-maven- + + - name: Set up JDK 8 + uses: actions/setup-java@v5 + with: + java-version: "8" + distribution: "adopt" + + - name: Build javadoc documentation + working-directory: hopsworks-api/java + run: mvn clean install javadoc:javadoc javadoc:aggregate -DskipTests && cp -r target/site/apidocs ../../docs/javadoc + - name: Check for broken links run: | # run the server diff --git a/build_great_expectations_inv.py b/build_great_expectations_inv.py new file mode 100644 index 0000000000..8e9b7e33ab --- /dev/null +++ b/build_great_expectations_inv.py @@ -0,0 +1,169 @@ +# The file is generated by Claude Code + +"""Build docs/great_expectations.inv for GX 0.18. + +GX 0.18 uses Docusaurus, not Sphinx, so no objects.inv is published. +Internally they still run sphinx-build as an intermediate step (to generate +MDX from HTML), but delete the output including objects.inv. + +This script reconstructs it by: + 1. Cloning the 0.18.x branch (if not already present at GX_CLONE_PATH) + 2. Generating the MD autodoc stubs using GX's own build logic, capturing + the sidebar_entries mapping: Sphinx HTML stem -> Docusaurus URL path + 3. Running sphinx-build to get a properly typed Sphinx objects.inv + 4. Remapping each entry's URI from the Sphinx HTML path to the Docusaurus URL + 5. 
Writing the result to docs/great_expectations.inv + +Usage: + uv run --python 3.11 --with 'great_expectations==0.18.22' \ + --with 'sphinx~=5.3.0' --with 'pydata-sphinx-theme==0.11.0' \ + --with 'myst-parser' --with 'docstring-parser==0.15' \ + --with 'sphobjinv' --with 'invoke' --with 'beautifulsoup4' \ + build_great_expectations_inv.py + +Or with a pre-created venv (see GX_VENV below): + python build_great_expectations_inv.py +""" + +from __future__ import annotations + +import pathlib +import subprocess +import sys +import tempfile + +# --- Configuration --- + +REPO_ROOT = pathlib.Path(__file__).parent +OUT_INV = REPO_ROOT / "docs" / "great_expectations.inv" + +# Where to clone/find the GX 0.18.x source +GX_CLONE_PATH = pathlib.Path(tempfile.gettempdir()) / "gx_0_18" + +# Sphinx binary (defaults to whatever is on PATH) +SPHINX_BUILD = "sphinx-build" + +# Base URL of the published Docusaurus API docs for 0.18 +DOCUSAURUS_BASE = "https://docs.greatexpectations.io/docs/0.18/reference/api/" + + +# --- Step 1: Ensure GX 0.18.x source is available --- + +if not GX_CLONE_PATH.exists(): + print(f"Cloning GX 0.18.x into {GX_CLONE_PATH} ...") + subprocess.run( + [ + "git", "clone", "--depth", "1", "--branch", "0.18.x", + "https://github.com/great-expectations/great_expectations.git", + str(GX_CLONE_PATH), + ], + check=True, + ) +else: + print(f"Using existing clone at {GX_CLONE_PATH}") + +sys.path.insert(0, str(GX_CLONE_PATH)) + +# --- Step 2: Generate stubs and capture sidebar_entries --- + +from docs.sphinx_api_docs_source.build_sphinx_api_docs import ( # noqa: E402 + SphinxInvokeDocsBuilder, + SidebarEntryType, +) + +import invoke # noqa: E402 + +api_source = GX_CLONE_PATH / "docs" / "sphinx_api_docs_source" +ctx = invoke.Context() +builder = SphinxInvokeDocsBuilder( + ctx=ctx, + api_docs_source_path=api_source, + repo_root=GX_CLONE_PATH, +) + +print("Generating autodoc stubs ...") +builder._build_class_md_stubs() +builder._build_module_md_stubs() + +# Build the 
mapping: Sphinx HTML stem -> (py_domain_type, docusaurus_relative_url) +stem_to_info: dict[str, tuple[str, str]] = {} + +for name, entry in builder.sidebar_entries.items(): + doc_url = str(entry.mdx_relpath.with_suffix("")) + if entry.type == SidebarEntryType.CLASS: + stem_to_info[name] = ("py:class", doc_url) + else: + # Module: key is the flat path string, stem is the md_relpath stem + stem = entry.md_relpath.stem + stem_to_info[stem] = ("py:module", doc_url) + +print(f" {len(stem_to_info)} sidebar entries captured") + +# --- Step 3: Run sphinx-build --- + +sphinx_out = GX_CLONE_PATH / "temp_inv_build" +sphinx_out.mkdir(exist_ok=True) + +print("Running sphinx-build ...") +subprocess.run( + [SPHINX_BUILD, "-M", "html", str(api_source), str(sphinx_out), "-E", "-q"], + check=True, +) +inv_path = sphinx_out / "html" / "objects.inv" +print(f" Sphinx objects.inv: {inv_path.stat().st_size} bytes") + +# --- Step 4: Remap entries --- + +import sphobjinv as soi # noqa: E402 + +sphinx_inv = soi.Inventory(str(inv_path)) +print(f" Sphinx inventory: {len(sphinx_inv.objects)} objects") + +remapped: list[soi.DataObjStr] = [] +skipped = 0 + +for obj in sphinx_inv.objects: + uri_raw = obj.uri + uri_path, _, fragment = uri_raw.partition("#") + stem = pathlib.Path(uri_path).stem + + if stem not in stem_to_info: + skipped += 1 + continue + + _, doc_url = stem_to_info[stem] + + # "$" means "use the object name as the anchor" + resolved_fragment = obj.name if fragment == "$" else fragment + full_uri = f"{doc_url}#{resolved_fragment}" if resolved_fragment else doc_url + + remapped.append( + soi.DataObjStr( + name=obj.name, + domain=obj.domain, + role=obj.role, + priority=str(obj.priority), + uri=full_uri, + dispname=obj.dispname or "-", + ) + ) + +print(f" Remapped: {len(remapped)}, skipped (index/search pages): {skipped}") + +# --- Step 5: Write inventory --- + +new_inv = soi.Inventory() +new_inv.project = "great_expectations" +new_inv.version = "0.18" +new_inv.objects = remapped + 
+soi.writebytes(str(OUT_INV), soi.compress(new_inv.data_file())) +print(f"Written {len(remapped)} entries to {OUT_INV}") + +# --- Cleanup --- + +builder._remove_md_stubs() +import shutil # noqa: E402 +shutil.rmtree(sphinx_out, ignore_errors=True) + +print("Done.") diff --git a/build_polars_patch_inv.py b/build_polars_patch_inv.py new file mode 100644 index 0000000000..dab60e2e0b --- /dev/null +++ b/build_polars_patch_inv.py @@ -0,0 +1,50 @@ +# The file is generated by Claude Code + +"""Build docs/polars_patch.inv — a patch for polars' broken objects.inv. + +The polars Sphinx inventory is missing py:class entries for all major classes +(DataFrame, LazyFrame, Series, Expr, Config, DataType). Their docs build +generates per-method pages and never emits the class-level entry. + +This script constructs a small supplemental inventory with just those six +entries, each pointing to the relevant class overview page. + +Usage: + uv run --with sphobjinv build_polars_patch_inv.py +""" + +import pathlib +import sphobjinv as soi + +OUT = pathlib.Path(__file__).parent / "docs" / "polars_patch.inv" + +# Mapping: fully-qualified name -> relative URL (from docs.pola.rs/api/python/stable/) +# $ means "use the object name as the anchor" (standard Sphinx convention) +MISSING_CLASSES = { + "polars.Config": "reference/config.html#$", + "polars.DataFrame": "reference/dataframe/index.html#$", + "polars.DataType": "reference/datatypes.html#$", + "polars.Expr": "reference/expressions/index.html#$", + "polars.LazyFrame": "reference/lazyframe/index.html#$", + "polars.Series": "reference/series/index.html#$", +} + +objects = [ + soi.DataObjStr( + name=name, + domain="py", + role="class", + priority="1", + uri=uri, + dispname="-", + ) + for name, uri in MISSING_CLASSES.items() +] + +inv = soi.Inventory() +inv.project = "polars-patch" +inv.version = "" +inv.objects = objects + +soi.writebytes(str(OUT), soi.compress(inv.data_file())) +print(f"Written {len(objects)} entries to {OUT}") diff --git 
a/docs/assets/images/architecture.svg b/docs/assets/images/architecture.svg index 573beb6d7b..20946b8d78 100644 --- a/docs/assets/images/architecture.svg +++ b/docs/assets/images/architecture.svg @@ -1,1421 +1,1421 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/css/custom.css b/docs/css/custom.css index 26f7a273cf..785a610372 100644 --- a/docs/css/custom.css +++ b/docs/css/custom.css @@ -205,6 +205,18 @@ header.md-header { display: none; } +/*******************************************************/ +/* Handle source links the nice way. */ +.source-link::after { + content: "[source]"; +} + +/*******************************************************/ +/* Handle overflow in sidebars. 
*/ +.md-ellipsis { + overflow-wrap: break-word; +} + /*******************************************************/ /* Custom styles for syntax highlighting in signatures. */ diff --git a/docs/great_expectations.inv b/docs/great_expectations.inv new file mode 100644 index 0000000000..7b2474cf64 Binary files /dev/null and b/docs/great_expectations.inv differ diff --git a/docs/polars_patch.inv b/docs/polars_patch.inv new file mode 100644 index 0000000000..3092aad1b4 --- /dev/null +++ b/docs/polars_patch.inv @@ -0,0 +1,5 @@ +# Sphinx inventory version 2 +# Project: polars-patch +# Version: +# The remainder of this file is compressed using zlib. +x}ν ݧ 6]nAI({O/"?#b0F=;9Q ӄK6K1W=džNʏ;ϑ@/6ʍˮBV e{ \ No newline at end of file diff --git a/docs/setup_installation/admin/roleChaining.md b/docs/setup_installation/admin/roleChaining.md index b796f0469b..e1e80a8990 100644 --- a/docs/setup_installation/admin/roleChaining.md +++ b/docs/setup_installation/admin/roleChaining.md @@ -29,13 +29,11 @@ For more details on how to create an IAM roles for Kubernetes service accounts s ```sh account_id=$(aws sts get-caller-identity --query "Account" --output text) oidc_provider=$(aws eks describe-cluster --name my-cluster --region $AWS_REGION --query "cluster.identity.oidc.issuer" --output text | sed -e "s/^https:\/\///") - ``` ```sh export namespace=hopsworks export service_account=my-service-account - ``` ```json diff --git a/docs/setup_installation/azure/getting_started.md b/docs/setup_installation/azure/getting_started.md index 2fa4dd09e6..7c617d46de 100644 --- a/docs/setup_installation/azure/getting_started.md +++ b/docs/setup_installation/azure/getting_started.md @@ -203,7 +203,6 @@ hopsfs: account: "STORAGE_ACCOUNT_NAME" container: "STORAGE_ACCOUNT_CONTAINER_NAME" identityClientId: "UA_IDENTITY_CLIENT_ID" - ``` ## Step 5: Deploy Hopsworks diff --git a/docs/templates/python/material/attribute.html.jinja b/docs/templates/python/material/attribute.html.jinja index f5d1fe083e..116eea37a1 
100644 --- a/docs/templates/python/material/attribute.html.jinja +++ b/docs/templates/python/material/attribute.html.jinja @@ -33,13 +33,19 @@ Context: {% set attribute_name = attribute.path if show_full_path else attribute.name %} + {% set label = config.toc_label if config.toc_label and root else attribute.name %} + {% if label | upper != label %} + {% set label = label.replace("A", "\u200bA").replace("B", "\u200bB").replace("C", "\u200bC").replace("D", "\u200bD").replace("E", "\u200bE").replace("F", "\u200bF").replace("G", "\u200bG").replace("H", "\u200bH").replace("I", "\u200bI").replace("J", "\u200bJ").replace("K", "\u200bK").replace("L", "\u200bL").replace("M", "\u200bM").replace("N", "\u200bN").replace("O", "\u200bO").replace("P", "\u200bP").replace("Q", "\u200bQ").replace("R", "\u200bR").replace("S", "\u200bS").replace("T", "\u200bT").replace("U", "\u200bU").replace("V", "\u200bV").replace("W", "\u200bW").replace("X", "\u200bX").replace("Y", "\u200bY").replace("Z", "\u200bZ") %} + {% endif %} + {% set label = label.replace("_", "\u200b_") %} + {% if not root or config.show_root_heading %} {% filter heading( heading_level, role="data" if attribute.parent.kind.value == "module" else "attr", id=html_id, class="doc doc-heading", - toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else attribute.name), + toc_label=(' '|safe if config.show_symbol_type_toc else '') + label, skip_inventory=config.skip_local_inventory, ) %} @@ -50,7 +56,7 @@ Context: -#} {% block source_link scoped %} {% if config.extra.link_source and attribute.source_link %} - [source] + {% endif %} {% endblock source_link %} @@ -97,7 +103,7 @@ Context: {% filter heading(heading_level, role="data" if attribute.parent.kind.value == "module" else "attr", id=html_id, - toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else attribute_name), + toc_label=(' '|safe if 
config.show_symbol_type_toc else '') + label, hidden=True, skip_inventory=config.skip_local_inventory, ) %} diff --git a/docs/templates/python/material/class.html.jinja b/docs/templates/python/material/class.html.jinja index ed20289083..5ce769f0bc 100644 --- a/docs/templates/python/material/class.html.jinja +++ b/docs/templates/python/material/class.html.jinja @@ -35,13 +35,19 @@ Context: {% set class_name = class.path if show_full_path else class.name %} + {% set label = config.toc_label if config.toc_label and root else class.name %} + {% if label | upper != label %} + {% set label = label.replace("A", "\u200bA").replace("B", "\u200bB").replace("C", "\u200bC").replace("D", "\u200bD").replace("E", "\u200bE").replace("F", "\u200bF").replace("G", "\u200bG").replace("H", "\u200bH").replace("I", "\u200bI").replace("J", "\u200bJ").replace("K", "\u200bK").replace("L", "\u200bL").replace("M", "\u200bM").replace("N", "\u200bN").replace("O", "\u200bO").replace("P", "\u200bP").replace("Q", "\u200bQ").replace("R", "\u200bR").replace("S", "\u200bS").replace("T", "\u200bT").replace("U", "\u200bU").replace("V", "\u200bV").replace("W", "\u200bW").replace("X", "\u200bX").replace("Y", "\u200bY").replace("Z", "\u200bZ") %} + {% endif %} + {% set label = label.replace("_", "\u200b_") %} + {% if not root or config.show_root_heading %} {% filter heading( heading_level, role="class", id=html_id, class="doc doc-heading", - toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else class.name), + toc_label=(' '|safe if config.show_symbol_type_toc else '') + label, skip_inventory=config.skip_local_inventory, ) %} @@ -68,7 +74,7 @@ Context: {% block source_link scoped %} {% if config.extra.link_source and class.source_link %} - [source] + {% endif %} {% endblock source_link %} @@ -143,7 +149,7 @@ Context: {% filter heading(heading_level, role="class", id=html_id, - toc_label=(' '|safe if config.show_symbol_type_toc else '') + 
(config.toc_label if config.toc_label and root else class.name), + toc_label=(' '|safe if config.show_symbol_type_toc else '') + label, hidden=True, skip_inventory=config.skip_local_inventory, ) %} @@ -226,6 +232,23 @@ Context: {% endif %} {% endblock inheritance_diagram %} + {% block deprecation scoped %} + {% if class.extra.hopsworks_apigen and class.extra.hopsworks_apigen.deprecated %} +
+ Deprecated +

+ {%- set dep = class.extra.hopsworks_apigen.deprecated -%} + {%- set available_until = dep.available_until -%} + {%- set deprecated_by = dep.deprecated_by -%} + {%- set version = "version " + available_until if available_until else "a future release" -%} + + {{ class.name }} is deprecated and will be removed in {{ version }} of Hopsworks. + Instead of it, consider using {% for rec in deprecated_by %}{{ rec.split(".")[-2:] | join(".") }}{% if not loop.last %}, {% endif %}{% endfor %}. +

+
+ {% endif %} + {% endblock deprecation %} + {% block docstring scoped %} {#- Docstring block. @@ -277,13 +300,12 @@ Context: {% if class.extra.hopsworks_apigen and class.extra.hopsworks_apigen.aliases %} diff --git a/docs/templates/python/material/function.html.jinja b/docs/templates/python/material/function.html.jinja index b2f67aee97..b83a06adf7 100644 --- a/docs/templates/python/material/function.html.jinja +++ b/docs/templates/python/material/function.html.jinja @@ -38,13 +38,19 @@ Context: {% set symbol_type = "method" if function.parent.is_class else "function" %} {#- Symbol type: method when parent is a class, function otherwise. -#} + {% set label = config.toc_label if config.toc_label and root else function.name %} + {% if label | upper != label %} + {% set label = label.replace("A", "\u200bA").replace("B", "\u200bB").replace("C", "\u200bC").replace("D", "\u200bD").replace("E", "\u200bE").replace("F", "\u200bF").replace("G", "\u200bG").replace("H", "\u200bH").replace("I", "\u200bI").replace("J", "\u200bJ").replace("K", "\u200bK").replace("L", "\u200bL").replace("M", "\u200bM").replace("N", "\u200bN").replace("O", "\u200bO").replace("P", "\u200bP").replace("Q", "\u200bQ").replace("R", "\u200bR").replace("S", "\u200bS").replace("T", "\u200bT").replace("U", "\u200bU").replace("V", "\u200bV").replace("W", "\u200bW").replace("X", "\u200bX").replace("Y", "\u200bY").replace("Z", "\u200bZ") %} + {% endif %} + {% set label = label.replace("_", "\u200b_") %} + {% if not root or config.show_root_heading %} {% filter heading( heading_level, role="function", id=html_id, class="doc doc-heading", - toc_label=((' ')|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else function.name), + toc_label=((' ')|safe if config.show_symbol_type_toc else '') + label, skip_inventory=config.skip_local_inventory, ) %} @@ -84,7 +90,7 @@ Context: {% block source_link scoped %} {% if config.extra.link_source and function.source_link %} - 
[source] + {% endif %} {% endblock source_link %} @@ -143,7 +149,7 @@ Context: heading_level, role="function", id=html_id, - toc_label=((' ')|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else function.name), + toc_label=((' ')|safe if config.show_symbol_type_toc else '') + label, hidden=True, skip_inventory=config.skip_local_inventory, ) %} @@ -160,6 +166,33 @@ Context: It contains other blocks that users can override. Overriding the contents block allows to rearrange the order of the blocks. -#} + {% block deprecation scoped %} + {% if function.extra.hopsworks_apigen and function.extra.hopsworks_apigen.deprecated %} +
+ Deprecated +

+ {%- set dep = function.extra.hopsworks_apigen.deprecated -%} + {%- set available_until = dep.available_until -%} + {%- set deprecated_by = dep.deprecated_by -%} + {%- set version = "version " + available_until if available_until else "a future release" -%} + + {{ function.name }} is deprecated and will be removed in {{ version }} of Hopsworks. + Instead of it, consider using {% for rec in deprecated_by %}{{ rec.split(".")[-2:] | join(".") }}{% if not loop.last %}, {% endif %}{% endfor %}. +

+
+ {% endif %} + {% endblock deprecation %} + + {% block docstring scoped %} + {#- Docstring block. + + This block renders the docstring for the function. + -#} + {% with docstring_sections = function.docstring.parsed %} + {% include "docstring.html.jinja" with context %} + {% endwith %} + {% endblock docstring %} + {% block aliases scoped %} {% if function.extra.hopsworks_apigen and function.extra.hopsworks_apigen.aliases %} {%- set public_aliases = function.extra.hopsworks_apigen.aliases | selectattr("is_public") | list -%} @@ -184,16 +217,6 @@ Context: {% endif %} {% endblock aliases %} - {% block docstring scoped %} - {#- Docstring block. - - This block renders the docstring for the function. - -#} - {% with docstring_sections = function.docstring.parsed %} - {% include "docstring.html.jinja" with context %} - {% endwith %} - {% endblock docstring %} - {% block source scoped %} {#- Source block. diff --git a/docs/templates/python/material/module.html.jinja b/docs/templates/python/material/module.html.jinja index 1fb1cb6950..39fc4be4e6 100644 --- a/docs/templates/python/material/module.html.jinja +++ b/docs/templates/python/material/module.html.jinja @@ -32,13 +32,19 @@ Context: {% set module_name = module.path if show_full_path else module.name %} + {% set label = config.toc_label if config.toc_label and root else module.name %} + {% if label | upper != label %} + {% set label = label.replace("A", "\u200bA").replace("B", "\u200bB").replace("C", "\u200bC").replace("D", "\u200bD").replace("E", "\u200bE").replace("F", "\u200bF").replace("G", "\u200bG").replace("H", "\u200bH").replace("I", "\u200bI").replace("J", "\u200bJ").replace("K", "\u200bK").replace("L", "\u200bL").replace("M", "\u200bM").replace("N", "\u200bN").replace("O", "\u200bO").replace("P", "\u200bP").replace("Q", "\u200bQ").replace("R", "\u200bR").replace("S", "\u200bS").replace("T", "\u200bT").replace("U", "\u200bU").replace("V", "\u200bV").replace("W", "\u200bW").replace("X", "\u200bX").replace("Y", 
"\u200bY").replace("Z", "\u200bZ") %} + {% endif %} + {% set label = label.replace("_", "\u200b_") %} + {% if not root or config.show_root_heading %} {% filter heading( heading_level, role="module", id=html_id, class="doc doc-heading", - toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else module.name), + toc_label=(' '|safe if config.show_symbol_type_toc else '') + label, skip_inventory=config.skip_local_inventory, ) %} @@ -49,7 +55,7 @@ Context: -#} {% block source_link scoped %} {% if config.extra.link_source and module.source_link %} - [source] + {% endif %} {% endblock source_link %} @@ -80,7 +86,7 @@ Context: {% filter heading(heading_level, role="module", id=html_id, - toc_label=(' '|safe if config.show_symbol_type_toc else '') + (config.toc_label if config.toc_label and root else module.name), + toc_label=(' '|safe if config.show_symbol_type_toc else '') + label, hidden=True, skip_inventory=config.skip_local_inventory, ) %} diff --git a/docs/user_guides/fs/data_source/usage.md b/docs/user_guides/fs/data_source/usage.md index 49e0491957..232d14b656 100644 --- a/docs/user_guides/fs/data_source/usage.md +++ b/docs/user_guides/fs/data_source/usage.md @@ -16,16 +16,19 @@ We will walk through each functionality in the sections below. We retrieve a data source simply by its unique name. 
=== "PySpark" + ```python import hopsworks + # Connect to the Hopsworks feature store project = hopsworks.login() feature_store = project.get_feature_store() # Retrieve data source - ds = feature_store.get_data_source('data_source_name') + ds = feature_store.get_data_source("data_source_name") ``` === "Scala" + ```scala import com.logicalclocks.hsfs._ val connection = HopsworksConnection.builder().build(); @@ -46,12 +49,16 @@ The exact behaviour could change depending on the fdata source type, but broadly For data sources based on object/file storage such as AWS S3, ADLS, GCS, we set the full object path in the `path` argument and users should pass a Spark data format (parquet, csv, orc, hudi, delta) to the `data_format` argument. === "PySpark" + ```python # read data into dataframe using path - df = connector.read(data_format='data_format', path='fileScheme://bucket/path/') + df = connector.read( + data_format="data_format", path="fileScheme://bucket/path/" + ) ``` === "Scala" + ```scala // read data into dataframe using path val df = connector.read("", "data_format", new HashMap(), "fileScheme://bucket/path/") @@ -75,6 +82,7 @@ Using `prepare_spark` is also not necessary when using the `read` API. For example, to read directly from a S3 connector, we use the `prepare_spark` as follows: === "PySpark" + ```python connector.prepare_spark() spark.read.format("json").load("s3a://[bucket]/path") @@ -90,6 +98,7 @@ Depending on the connector type, users can also just set the table path and read This is mostly relevant for Google BigQuery. === "PySpark" + ```python # read results from a SQL df = connector.read(query="SELECT * FROM TABLE") @@ -98,6 +107,7 @@ This is mostly relevant for Google BigQuery. 
``` === "Scala" + ```scala // read results from a SQL val df = connector.read("SELECT * FROM TABLE", "" , new HashMap(),"") @@ -110,7 +120,7 @@ For reading data streams, the Kafka Data Source supports reading a Kafka topic i === "PySpark" ```python - df = connector.read_stream(topic='kafka_topic_name') + df = connector.read_stream(topic="kafka_topic_name") ``` ## Creating an External Feature Group @@ -125,15 +135,17 @@ Depending on the external source, we should set either the `query` argument for Example for any data warehouse/SQL based external sources, we set the desired SQL to `query` argument, and set the `data_source` argument to the data source object of desired data source. === "PySpark" + ```python - ds.query="SELECT * FROM TABLE" + ds.query = "SELECT * FROM TABLE" - fg = feature_store.create_external_feature_group(name="sales", + fg = feature_store.create_external_feature_group( + name="sales", version=1, description="Physical shop sales features", - data_source = ds, - primary_key=['ss_store_sk'], - event_time='sale_date' + data_source=ds, + primary_key=["ss_store_sk"], + event_time="sale_date", ) ``` @@ -147,13 +159,14 @@ Data Sources are also used while writing training data to external sources. While calling the [Feature View](../../../concepts/fs/feature_view/fv_overview.md) API `create_training_data`, we can pass the `data_source` argument which is necessary to materialise the data to external sources, as shown below. 
=== "PySpark" + ```python # materialise a training dataset version, job = feature_view.create_training_data( - description = 'describe training data', - data_format = 'spark_data_format', # e.g., data_format = "parquet" or data_format = "csv" - write_options = {"wait_for_job": False}, - data_source = ds + description="describe training data", + data_format="spark_data_format", # e.g., data_format = "parquet" or data_format = "csv" + write_options={"wait_for_job": False}, + data_source=ds, ) ``` diff --git a/docs/user_guides/fs/feature_group/create.md b/docs/user_guides/fs/feature_group/create.md index a6b27cc696..acbcea43c4 100644 --- a/docs/user_guides/fs/feature_group/create.md +++ b/docs/user_guides/fs/feature_group/create.md @@ -28,14 +28,15 @@ Using the HSFS API you can execute: === "PySpark" ```python - fg = feature_store.create_feature_group(name="weather", + fg = feature_store.create_feature_group( + name="weather", version=1, description="Weather Features", online_enabled=True, - primary_key=['location_id'], - partition_key=['day'], - event_time='event_time', - time_travel_format='DELTA', + primary_key=["location_id"], + partition_key=["day"], + event_time="event_time", + time_travel_format="DELTA", ) ``` @@ -121,12 +122,18 @@ The code example shows the creation of an online-enabled feature group that stor ```python fg = fs.create_feature_group( - name='air_quality', - description='Air Quality characteristics of each day', + name="air_quality", + description="Air Quality characteristics of each day", version=1, - primary_key=['city','date'], + primary_key=["city", "date"], online_enabled=True, - online_config={'table_space': 'ts_1', 'online_comments': ['NDB_TABLE=READ_BACKUP=1', 'NDB_TABLE=PARTITION_BALANCE=FOR_RP_BY_LDM_X_2']} + online_config={ + "table_space": "ts_1", + "online_comments": [ + "NDB_TABLE=READ_BACKUP=1", + "NDB_TABLE=PARTITION_BALANCE=FOR_RP_BY_LDM_X_2", + ], + }, ) ``` @@ -150,29 +157,31 @@ For Python environments, only the stream API is 
supported (stream=True). === "Python" ```python - fg = feature_store.create_feature_group(name="weather", + fg = feature_store.create_feature_group( + name="weather", version=1, description="Weather Features", online_enabled=True, - primary_key=['location_id'], - partition_key=['day'], - event_time='event_time', - time_travel_format='HUDI', + primary_key=["location_id"], + partition_key=["day"], + event_time="event_time", + time_travel_format="HUDI", ) ``` === "PySpark" ```python - fg = feature_store.create_feature_group(name="weather", + fg = feature_store.create_feature_group( + name="weather", version=1, description="Weather Features", online_enabled=True, - primary_key=['location_id'], - partition_key=['day'], - event_time='event_time', - time_travel_format='HUDI', - stream=True + primary_key=["location_id"], + partition_key=["day"], + event_time="event_time", + time_travel_format="HUDI", + stream=True, ) ``` @@ -221,10 +230,7 @@ For example, most commonly, filtering is done on the event time column of a feat query = fg.select_all() # create a simple feature view -fv = fs.create_feature_view( - name='transactions_view', - query=query -) +fv = fs.create_feature_view(name="transactions_view", query=query) # set up dates start_time = "2022-01-01" @@ -234,7 +240,7 @@ end_time = "2022-06-30" version, job = fv.create_training_data( start_time=start_time, end_time=end_time, - description='Description of a dataset', + description="Description of a dataset", ) ``` @@ -280,9 +286,9 @@ For example, the inserted dataframe (unique combination of partition key values) !!! 
example "Default Hudi partitioning" ```python write_options = { - 'hoodie.bulkinsert.shuffle.parallelism': 5, - 'hoodie.insert.shuffle.parallelism': 5, - 'hoodie.upsert.shuffle.parallelism': 5 + "hoodie.bulkinsert.shuffle.parallelism": 5, + "hoodie.insert.shuffle.parallelism": 5, + "hoodie.upsert.shuffle.parallelism": 5, } ``` That means, using Spark, Hudi shuffles the data into five in-memory partitions, which each fill map to a task and finally a parquet file (see figure below). @@ -305,9 +311,9 @@ If the inserted Dataframe contains multiple feature group partitions, the parque You can change the write options on every insert, depending also on the size of the data you are writing: ```python write_options = { - 'hoodie.bulkinsert.shuffle.parallelism': 5, - 'hoodie.insert.shuffle.parallelism': 5, - 'hoodie.upsert.shuffle.parallelism': 5 + "hoodie.bulkinsert.shuffle.parallelism": 5, + "hoodie.insert.shuffle.parallelism": 5, + "hoodie.upsert.shuffle.parallelism": 5, } fg.insert(df, write_options=write_options) ``` diff --git a/docs/user_guides/fs/feature_group/create_external.md b/docs/user_guides/fs/feature_group/create_external.md index e0a779a5e6..7b5e7c2481 100644 --- a/docs/user_guides/fs/feature_group/create_external.md +++ b/docs/user_guides/fs/feature_group/create_external.md @@ -48,13 +48,14 @@ Once you have defined the metadata, you can GROUP BY ss_store_sk, sales_date """ - fg = feature_store.create_external_feature_group(name="sales", + fg = feature_store.create_external_feature_group( + name="sales", version=1, description="Physical shop sales features", query=query, data_source=ds, - primary_key=['ss_store_sk'], - event_time='sale_date' + primary_key=["ss_store_sk"], + event_time="sale_date", ) fg.save() @@ -65,13 +66,14 @@ Once you have defined the metadata, you can === "Python" ```python - fg = feature_store.create_external_feature_group(name="sales", + fg = feature_store.create_external_feature_group( + name="sales", version=1, description="Physical 
shop sales features", data_format="parquet", data_source=ds, - primary_key=['ss_store_sk'], - event_time='sale_date' + primary_key=["ss_store_sk"], + event_time="sale_date", ) fg.save() @@ -108,14 +110,15 @@ For an external feature group to be available online, during the creation of the ```python external_fg = fs.create_external_feature_group( - name="sales", - version=1, - description="Physical shop sales features", - query=query, - data_source=ds, - primary_key=['ss_store_sk'], - event_time='sale_date', - online_enabled=True) + name="sales", + version=1, + description="Physical shop sales features", + query=query, + data_source=ds, + primary_key=["ss_store_sk"], + event_time="sale_date", + online_enabled=True, + ) external_fg.save() # read from external storage and filter data to sync to online diff --git a/docs/user_guides/fs/feature_group/create_spine.md b/docs/user_guides/fs/feature_group/create_spine.md index 0efba91c2f..75434fcfd1 100644 --- a/docs/user_guides/fs/feature_group/create_spine.md +++ b/docs/user_guides/fs/feature_group/create_spine.md @@ -28,9 +28,9 @@ Additionally, apart from primary key and event time information, a Spark datafra name="spine_transactions", version=1, description="Transaction data", - primary_key=['cc_num'], - event_time='datetime', - dataframe=trans_df + primary_key=["cc_num"], + event_time="datetime", + dataframe=trans_df, ) ``` diff --git a/docs/user_guides/fs/feature_group/data_types.md b/docs/user_guides/fs/feature_group/data_types.md index d5b42395c0..c3def54111 100644 --- a/docs/user_guides/fs/feature_group/data_types.md +++ b/docs/user_guides/fs/feature_group/data_types.md @@ -166,10 +166,15 @@ The byte size of each column is determined by its data type and calculated as fo For online enabled feature groups, the dataframe to be ingested needs to adhere to the online schema definitions. The input dataframe is validated for schema checks accordingly. 
The validation is enabled by default and can be disabled by setting below key word argument when calling `insert()` + === "Python" + ```python - feature_group.insert(df, validation_options={'online_schema_validation':False}) + feature_group.insert( + df, validation_options={"online_schema_validation": False} + ) ``` + The most important validation checks or error messages are mentioned below along with possible corrective actions. 01. Primary key contains null values @@ -179,20 +184,21 @@ The most important validation checks or error messages are mentioned below along Alternatively, find the null values and assign them an unique value as per preferred strategy for data imputation. === "Pandas" + ```python # Drop rows: assuming 'id' is the primary key column - df = df.dropna(subset=['id']) + df = df.dropna(subset=["id"]) # For composite keys - df = df.dropna(subset=['id1', 'id2']) + df = df.dropna(subset=["id1", "id2"]) # Data imputation: replace null values with incrementing last integer id # existing max id - max_id = df['id'].max() + max_id = df["id"].max() # counter to generate new id next_id = max_id + 1 # for each null id, assign the next id incrementally - for idx in df[df['id'].isna()].index: - df.loc[idx, 'id'] = next_id + for idx in df[df["id"].isna()].index: + df.loc[idx, "id"] = next_id next_id += 1 ``` @@ -202,9 +208,10 @@ The most important validation checks or error messages are mentioned below along - **Example correction** Add all the primary key columns in the dataframe. === "Pandas" + ```python # incrementing primary key upto the length of dataframe - df['id'] = range(1, len(df) + 1) + df["id"] = range(1, len(df) + 1) ``` 03. String length exceeded @@ -216,34 +223,40 @@ The most important validation checks or error messages are mentioned below along - Trim the string values to fit within maximum limit set during feature group creation. 
=== "Pandas" + ```python max_length = 100 - df['text_column'] = df['text_column'].str.slice(0, max_length) + df["text_column"] = df["text_column"].str.slice(0, max_length) ``` - Another option is to simply [create new version of the feature group][hsfs.feature_store.FeatureStore.get_or_create_feature_group] and insert the dataframe. - !!!note + !!! note The total row size limit should be less than 30kb as per [row size restrictions](#online-restrictions-for-row-size). In such cases it is possible to define the feature as **TEXT** or **BLOB**. Below is an example of explicitly defining the string column as TEXT as online type. === "Pandas" + ```python import pandas as pd + # example dummy dataframe with the string column - df = pd.DataFrame(columns=['id', 'string_col']) + df = pd.DataFrame(columns=["id", "string_col"]) from hsfs.feature import Feature + features = [ - Feature(name="id",type="bigint",online_type="bigint"), - Feature(name="string_col",type="string",online_type="text") + Feature(name="id", type="bigint", online_type="bigint"), + Feature(name="string_col", type="string", online_type="text"), ] - fg = fs.get_or_create_feature_group(name="fg_manual_text_schema", - version=1, - features=features, - online_enabled=True, - primary_key=['id']) + fg = fs.get_or_create_feature_group( + name="fg_manual_text_schema", + version=1, + features=features, + online_enabled=True, + primary_key=["id"], + ) fg.insert(df) ``` @@ -279,17 +292,18 @@ If users explicitly define the schema for the feature group, Hopsworks is going You can explicitly define the feature group schema as follows: === "Python" + ```python from hsfs.feature import Feature features = [ - Feature(name="id",type="int",online_type="int"), - Feature(name="name",type="string",online_type="varchar(20)") + Feature(name="id", type="int", online_type="int"), + Feature(name="name", type="string", online_type="varchar(20)"), ] - fg = fs.create_feature_group(name="fg_manual_schema", - features=features, - 
online_enabled=True) + fg = fs.create_feature_group( + name="fg_manual_schema", features=features, online_enabled=True + ) fg.save(features) ``` @@ -299,12 +313,13 @@ Hopsworks supports appending additional features to an existing feature group. Adding additional features to an existing feature group is not considered a breaking change. === "Python" + ```python from hsfs.feature import Feature features = [ - Feature(name="id",type="int",online_type="int"), - Feature(name="name",type="string",online_type="varchar(20)") + Feature(name="id", type="int", online_type="int"), + Feature(name="name", type="string", online_type="varchar(20)"), ] fg = fs.get_feature_group(name="example", version=1) diff --git a/docs/user_guides/fs/feature_group/data_validation.md b/docs/user_guides/fs/feature_group/data_validation.md index f2a37c65da..1d0e370c63 100644 --- a/docs/user_guides/fs/feature_group/data_validation.md +++ b/docs/user_guides/fs/feature_group/data_validation.md @@ -109,7 +109,7 @@ In order to define and validate an expectation when writing to a Feature Group, Connect the client running your notebooks to Hopsworks. -```python3 +```python import hopsworks project = hopsworks.login() @@ -124,10 +124,13 @@ The `fs` Feature Store entity is now ready to be used to insert or read data fro Load your data in a DataFrame using the usual pandas API. -```python3 +```python import pandas as pd -df = pd.read_csv("https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/transactions.csv", parse_dates=["datetime"]) +df = pd.read_csv( + "https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/transactions.csv", + parse_dates=["datetime"], +) df.head(3) ``` @@ -143,7 +146,7 @@ Everything is done using the Great Expectations API so you can re-use any prior Create (or import an existing) expectation suite using the Great Expectations library. This suite will hold all the validation tests we want to perform on our data before inserting them into Hopsworks. 
-```python3 +```python import great_expectations as ge expectation_suite = ge.core.ExpectationSuite( @@ -156,26 +159,18 @@ expectation_suite = ge.core.ExpectationSuite( Add some expectation to your suite. Each expectation configuration corresponds to a validation test to be run against your data. -```python3 +```python expectation_suite.add_expectation( ge.core.ExpectationConfiguration( expectation_type="expect_column_min_to_be_between", - kwargs={ - "column": "foo_id", - "min_value": 0, - "max_value": 1 - } + kwargs={"column": "foo_id", "min_value": 0, "max_value": 1}, ) ) expectation_suite.add_expectation( ge.core.ExpectationConfiguration( expectation_type="expect_column_value_lengths_to_be_between", - kwargs={ - "column": "bar_name", - "min_value": 3, - "max_value": 10 - } + kwargs={"column": "bar_name", "min_value": 3, "max_value": 10}, ) ) ``` @@ -185,7 +180,7 @@ expectation_suite.add_expectation( Building Expectation Suite by hand can be a major time commitment when you have dozens of Features. Great Expectations offers `Profiler` classes to inspect a sample of your data and infers a suitable Expectation Suite that you will be able to register with Hopsworks. -```python3 +```python ge_profiler = ge.profile.BasicSuiteBuilderProfiler() expectation_suite_profiler, _ = ge_profiler.profile(ge.from_pandas(df)) ``` @@ -199,20 +194,20 @@ Once a Feature Group is registered in the Feature Store, you can use it to inser For more information see [create Feature Group](create.md). 
To benefit from automatic validation on insertion, attach your newly created Expectation Suite when creating the Feature Group: -```python3 +```python fg = fs.create_feature_group( - "fg_with_data_validation", - version=1, - description="Validated data", - primary_key=['foo_id'], - online_enabled=False, - expectation_suite=expectation_suite + "fg_with_data_validation", + version=1, + description="Validated data", + primary_key=["foo_id"], + online_enabled=False, + expectation_suite=expectation_suite, ) ``` or, if the Feature Group already exist, you can simply run: -```python3 +```python fg.save_expectation_suite(expectation_suite) ``` @@ -220,7 +215,7 @@ That is all there is to it. Hopsworks will now automatically use your suite to validate the DataFrames you want to write to the Feature Group. Try it out! -```python3 +```python job, validation_report = fg.insert(df.head(5)) ``` @@ -242,7 +237,7 @@ As you can see, your Feature Group conveniently gather all in one place: your da Hopsworks client API allows you to retrieve validation reports for further analysis. -```python3 +```python # load multiple reports validation_reports = fg.get_validation_reports() @@ -252,10 +247,8 @@ ge_latest_report = fg.get_latest_validation_report() Similarly you can retrieve the historic of validation results for a particular expectation, e.g to plot a time-series of a given expectation observed value over time. -```python3 -validation_history = fg.get_validation_history( - expectationId=1 -) +```python +validation_history = fg.get_validation_history(expectationId=1) ``` You can find the expectationIds in the UI or using `fg.get_expectation_suite` and looking it up in the expectation's meta field. 
diff --git a/docs/user_guides/fs/feature_group/data_validation_advanced.md b/docs/user_guides/fs/feature_group/data_validation_advanced.md index e3526a41ae..86c63e731c 100644 --- a/docs/user_guides/fs/feature_group/data_validation_advanced.md +++ b/docs/user_guides/fs/feature_group/data_validation_advanced.md @@ -23,8 +23,8 @@ Go to the Feature Group edit page, in the Expectation section you can choose bet #### Validation Ingestion Policy in Python -```python3 -fg.expectation_suite.validation_ingestion_policy = "ALWAYS" # "STRICT" +```python +fg.expectation_suite.validation_ingestion_policy = "ALWAYS" # "STRICT" ``` If your suite is registered with Hopsworks, it will persist the change to the server. @@ -44,15 +44,15 @@ This will be used as the default option but can be overridden via the API. To disable data validation until further notice in the API, you can update the `run_validation` field of the expectation suite. If your suite is registered with Hopsworks, this will persist the change to the server. -```python3 +```python fg.expectation_suite.run_validation = False ``` If you wish to override the default behaviour of the suite when inserting data in the Feature Group, you can do so via the `validate_options` kwarg. The example below will enable validation for this insertion only. -```python3 -fg.insert(df_to_validate, validation_options={"run_validation" : True}) +```python +fg.insert(df_to_validate, validation_options={"run_validation": True}) ``` We recommend to avoid using this option in scheduled job as it silently changes the expected behaviour that is displayed in the UI and prevents changes to the default behaviour to change the behaviour of the job. 
@@ -79,20 +79,18 @@ Note that you must have inserted data in the FG and attached the expectation sui Get an expectation with a given id: -```python3 +```python my_expectation = fg.expectation_suite.get_expectation( - expectation_id = my_expectation_id + expectation_id=my_expectation_id ) ``` Add a new expectation: -```python3 +```python new_expectation = ge.core.ExpectationConfiguration( expectation_type="expect_column_values_not_to_be_null", - kwargs={ - "mostly": 1 - } + kwargs={"mostly": 1}, ) fg.expectation_suite.add_expectation(new_expectation) @@ -100,7 +98,7 @@ fg.expectation_suite.add_expectation(new_expectation) Edit expectation kwargs of an existing expectation : -```python3 +```python existing_expectation = fg.expectation_suite.get_expectation( expectation_id=existing_expectation_id ) @@ -112,7 +110,7 @@ fg.expectation_suite.replace_expectation(existing_expectation) Remove an expectation: -```python3 +```python fg.expectation_suite.remove_expectation( expectation_id=id_of_expectation_to_delete ) @@ -120,7 +118,7 @@ fg.expectation_suite.remove_expectation( If you want to deal only with the Great Expectation API: -```python3 +```python my_suite = fg.get_expectation_suite() my_suite.add_expectation(new_expectation) @@ -139,7 +137,7 @@ The UI does not currently support upload of a validation report. 
#### Save Validation Reports in Python -```python3 +```python fg.save_validation_report(ge_report) ``` @@ -155,7 +153,7 @@ One tab allows you to check the report history with general information, while t #### Monitor and Fetch Validation Reports in Python -```python3 +```python # convenience method for rapid development ge_latest_report = fg.get_latest_validation_report() # fetching the latest summary prints a link to the UI @@ -178,7 +176,7 @@ The button will launch a job which will read the Feature Group data, run validat #### Validate Your Data Manually in Python -```python3 +```python ge_report = fg.validate(df, ingestion_result="EXPERIMENT") # set the save_report parameter to False to skip uploading the report to Hopsworks @@ -188,13 +186,13 @@ ge_report = fg.validate(df, ingestion_result="EXPERIMENT") If you want to apply validation to the data already in the Feature Group you can call the `.validate` without providing data. It will read the data in the Feature Group. -```python3 +```python report = fg.validate() ``` As validation objects returned by Hopsworks are native Great Expectation objects you can run validation using the usual Great Expectations syntax: -```python3 +```python ge_df = ge.from_pandas(df, expectation_suite=fg.get_expectation_suite()) ge_report = ge_df.validate() ``` diff --git a/docs/user_guides/fs/feature_group/data_validation_best_practices.md b/docs/user_guides/fs/feature_group/data_validation_best_practices.md index 64d36fa2da..9274ebebe6 100644 --- a/docs/user_guides/fs/feature_group/data_validation_best_practices.md +++ b/docs/user_guides/fs/feature_group/data_validation_best_practices.md @@ -15,13 +15,17 @@ As often with data validation, the best piece of advice is to set it up early in Use this phase to build a history you can then use when it becomes time to set quality requirements for a project in production. We made a code snippet to help you get started quickly: -```python3 +```python # Load sample data. 
# Replace it with your own! -my_data_df = pd.read_csv("https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/credit_cards.csv") +my_data_df = pd.read_csv( + "https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/credit_cards.csv" +) # Use Great Expectation profiler (ignore deprecation warning) -expectation_suite_profiled, validation_report = ge.from_pandas(my_data_df).profile(profiler=ge.profile.BasicSuiteBuilderProfiler) +expectation_suite_profiled, validation_report = ge.from_pandas( + my_data_df +).profile(profiler=ge.profile.BasicSuiteBuilderProfiler) # Create a Feature Group on hopsworks with an expectation suite attached. # Don't forget to change the primary key! @@ -29,13 +33,14 @@ my_validated_data_fg = fs.get_or_create_feature_group( name="my_validated_data_fg", version=1, description="My data", - primary_key=['cc_num'], - expectation_suite=expectation_suite_profiled) + primary_key=["cc_num"], + expectation_suite=expectation_suite_profiled, +) ``` Any data you insert in the Feature Group from now will be validated and a report will be uploaded to Hopsworks. -```python3 +```python # Insert and validate your data insert_job, validation_report = my_validated_data_fg.insert(my_data_df) ``` @@ -73,10 +78,8 @@ Below are some simple tips and snippets to make the most of your data validation Whether you use an existing or create a new (recommended) Feature Group for production, we recommend you set the validation ingestion policy of your Expectation Suite to `"STRICT"`. -```python3 -fg_prod.save_expectation_suite( - my_suite, - validation_ingestion_policy="STRICT") +```python +fg_prod.save_expectation_suite(my_suite, validation_ingestion_policy="STRICT") ``` In this setup, Hopsworks will abort inserting a DataFrame that does not successfully fulfill all expectations in the attached Expectation Suite. 
@@ -87,7 +90,7 @@ This ensures data quality standards are upheld for every insertion and provide d Aborting insertions of DataFrames which do not satisfy the data quality standards can lead to data loss in your materialization job. To avoid such loss we recommend creating a duplicate Feature Group with the same Expectation Suite in `"ALWAYS"` mode which will hold the rejected data. -```python3 +```python job, report = fg_prod.insert(df) if report["success"] is False: @@ -99,17 +102,17 @@ if report["success"] is False: You can easily retrieve the validation history of a specific expectation to export it to your favourite visualisation tool. You can filter on time and on whether insertion was successful or not. -```python3 +```python validation_history = fg.get_validation_history( - expectation_id=my_id, - filters=["REJECTED", "UNKNOWN"], - ge_type=False + expectation_id=my_id, filters=["REJECTED", "UNKNOWN"], ge_type=False ) timeseries = pd.DataFrame( { - "observed_value": [res.result["observed_value"] for res in validation_history], - "validation_time": [res.validation_time for res in validation_history] + "observed_value": [ + res.result["observed_value"] for res in validation_history + ], + "validation_time": [res.validation_time for res in validation_history], } ) diff --git a/docs/user_guides/fs/feature_group/deprecation.md b/docs/user_guides/fs/feature_group/deprecation.md index 531c11a3b0..c0b11079d0 100644 --- a/docs/user_guides/fs/feature_group/deprecation.md +++ b/docs/user_guides/fs/feature_group/deprecation.md @@ -25,7 +25,9 @@ To deprecate a feature group using the HSFS APIs you need to provide a [Feature === "Python" ```python - fg = fs.get_feature_group(name="feature_group_name", version=feature_group_version) + fg = fs.get_feature_group( + name="feature_group_name", version=feature_group_version + ) ``` ### Deprecate Feature Group diff --git a/docs/user_guides/fs/feature_group/feature_monitoring.md 
b/docs/user_guides/fs/feature_group/feature_monitoring.md index 509d741c93..ccb8201e32 100644 --- a/docs/user_guides/fs/feature_group/feature_monitoring.md +++ b/docs/user_guides/fs/feature_group/feature_monitoring.md @@ -39,7 +39,7 @@ Connect the client running your notebooks to Hopsworks. === "Python" - ```python3 + ```python import hopsworks project = hopsworks.login() @@ -60,7 +60,7 @@ The following is a code example for getting or creating a Feature Group with nam === "Python" - ```python3 + ```python # Retrieve an existing feature group trans_fg = fs.get_feature_group("trans_fg", version=1) @@ -83,7 +83,7 @@ You can setup statistics monitoring on a ==single feature or multiple features== === "Python" - ```python3 + ```python # compute statistics for all the features fg_monitoring_config = trans_fg.create_statistics_monitoring( name="trans_fg_all_features_monitoring", @@ -105,7 +105,7 @@ You can create multiple feature monitoring configurations for the same Feature G === "Python" - ```python3 + ```python fg_monitoring_config = trans_fg.create_feature_monitoring( name="trans_fg_amount_monitoring", feature_name="amount", @@ -120,12 +120,12 @@ You can modify the default schedule by adjusting the `cron_expression`, `start_d === "Python" - ```python3 + ```python fg_monitoring_config = trans_fg.create_statistics_monitoring( name="trans_fg_all_features_monitoring", description="Compute statistics on all data of all features of the Feature Group on a weekly basis", cron_expression="0 0 12 ? * MON *", # weekly - row_percentage=0.8, # use 80% of the data + row_percentage=0.8, # use 80% of the data ) # or @@ -134,7 +134,7 @@ You can modify the default schedule by adjusting the `cron_expression`, `start_d feature_name="amount", description="Compute descriptive statistics on the amount Feature of the Feature Group on a weekly basis", cron_expression="0 0 12 ? 
* MON *", # weekly - row_percentage=0.8, # use 80% of the data + row_percentage=0.8, # use 80% of the data ) ``` @@ -146,10 +146,10 @@ Additionally, you can specify the percentage of feature data on which statistics === "Python" - ```python3 + ```python fm_monitoring_config.with_detection_window( window_length="1w", # data ingested during one week - time_offset="1w", # starting from last week + time_offset="1w", # starting from last week row_percentage=0.8, # use 80% of the data ) ``` @@ -160,11 +160,11 @@ When setting up feature monitoring for a Feature Group, reference windows can be === "Python" - ```python3 + ```python # compare statistics against a reference window fm_monitoring_config.with_reference_window( window_length="1w", # data ingested during one week - time_offset="2w", # starting from two weeks ago + time_offset="2w", # starting from two weeks ago row_percentage=0.8, # use 80% of the data ) @@ -182,12 +182,12 @@ Then, you can define a relative or absolute threshold using the `threshold` and === "Python" - ```python3 + ```python fm_monitoring_config.compare_on( metric="mean", threshold=0.2, # a relative change over 20% is considered anomalous relative=True, # relative or absolute change - strict=False, # strict or relaxed comparison + strict=False, # strict or relaxed comparison ) ``` @@ -201,7 +201,7 @@ Once the configuration is saved, the schedule for the statistics computation and === "Python" - ```python3 + ```python fm_monitoring_config.save() ``` diff --git a/docs/user_guides/fs/feature_group/notification.md b/docs/user_guides/fs/feature_group/notification.md index 27402ed39d..b3d2010c6d 100644 --- a/docs/user_guides/fs/feature_group/notification.md +++ b/docs/user_guides/fs/feature_group/notification.md @@ -26,11 +26,12 @@ To enable Change Data Capture for an online-enabled feature group using the HSFS ```python fg = fs.create_feature_group( - name="feature_group_name", - version=feature_group_version, - primary_key=feature_group_primary_keys, 
- online_enabled=True, - notification_topic_name="notification_topic_name") + name="feature_group_name", + version=feature_group_version, + primary_key=feature_group_primary_keys, + online_enabled=True, + notification_topic_name="notification_topic_name", + ) ``` ### Update Feature Group with Change Data Capture topic using Python @@ -43,7 +44,8 @@ With the default configuration, it can take up to 30 minutes for these changes t ```python fg.update_notification_topic_name( - notification_topic_name="new_notification_topic_name") + notification_topic_name="new_notification_topic_name" + ) ``` ## Using UI diff --git a/docs/user_guides/fs/feature_group/on_demand_transformations.md b/docs/user_guides/fs/feature_group/on_demand_transformations.md index 8cabf1370a..9eadc8c45c 100644 --- a/docs/user_guides/fs/feature_group/on_demand_transformations.md +++ b/docs/user_guides/fs/feature_group/on_demand_transformations.md @@ -18,45 +18,53 @@ Alternatively, the name of the resulting on-demand feature can be explicitly def Each on-demand transformation function can map specific features to its arguments by explicitly providing their names as arguments to the transformation function. If no feature names are provided, the transformation function will default to using features that match the name of the transformation function's argument. -=== "Python" !!! example "Creating on-demand transformation functions." - ```python - # Define transformation function - @hopsworks.udf(return_type=int, drop=["current_date"]) - def transaction_age(transaction_date, current_date): - return (current_date - transaction_date).dt.days - - @hopsworks.udf(return_type=[str, str], drop=["current_date"]) - def stripped_strings(country, city): - return country.strip(), city.strip() - - # Attach transformation function to feature group to create on-demand transformation function. 
- fg = feature_store.create_feature_group(name="fg_transactions", - version=1, - description="Transaction Features", - online_enabled=True, - primary_key=['id'], - event_time='event_time', - transformation_functions=[transaction_age, stripped_strings] - ) - ``` + === "Python" + + ```python + # Define transformation function + @hopsworks.udf(return_type=int, drop=["current_date"]) + def transaction_age(transaction_date, current_date): + return (current_date - transaction_date).dt.days + + + @hopsworks.udf(return_type=[str, str], drop=["current_date"]) + def stripped_strings(country, city): + return country.strip(), city.strip() + + + # Attach transformation function to feature group to create on-demand transformation function. + fg = feature_store.create_feature_group( + name="fg_transactions", + version=1, + description="Transaction Features", + online_enabled=True, + primary_key=["id"], + event_time="event_time", + transformation_functions=[transaction_age, stripped_strings], + ) + ``` ### Specifying input features The features to be used by the on-demand transformation function can be specified by providing the feature names as input to the transformation functions. -=== "Python" !!! example "Creating on-demand transformations by specifying features to be passed to transformation function." 
- ```python - fg = feature_store.create_feature_group(name="fg_transactions", - version=1, - description="Transaction Features", - online_enabled=True, - primary_key=['id'], - event_time='event_time', - transformation_functions=[age_transaction('transaction_time', 'current_time')] - ) - ``` + === "Python" + + ```python + fg = feature_store.create_feature_group( + name="fg_transactions", + version=1, + description="Transaction Features", + online_enabled=True, + primary_key=["id"], + event_time="event_time", + transformation_functions=[ + age_transaction("transaction_time", "current_time") + ], + ) + ``` ## Usage @@ -77,23 +85,25 @@ Inserting on-demand features as historical features saves time and computational A feature view can include on-demand features from feature groups by selecting them in the [query](../feature_view/query.md) used to create the feature view. These on-demand features are equivalent to regular features, and [model-dependent transformations](../feature_view/model-dependent-transformations.md) can be applied to them if required. -=== "Python" !!! 
example "Creating feature view with on-demand features" - ```python + === "Python" - # Selecting on-demand features in query - query = fg.select(["id", "feature1", "feature2", "on_demand_feature3", "on_demand_feature4"]) + ```python + # Selecting on-demand features in query + query = fg.select( + ["id", "feature1", "feature2", "on_demand_feature3", "on_demand_feature4"] + ) - # Creating a feature view using a query that contains on-demand transformations and model-dependent transformations - feature_view = fs.create_feature_view( - name='transactions_view', + # Creating a feature view using a query that contains on-demand transformations and model-dependent transformations + feature_view = fs.create_feature_view( + name="transactions_view", query=query, transformation_functions=[ min_max_scaler("feature1"), min_max_scaler("on_demand_feature3"), - ] + ], ) - ``` + ``` ### Computing on-demand features @@ -113,66 +123,69 @@ However, if the required input parameters are also not present in the feature ve The `get_feature_vector` function retrieves a single feature vector based on the feature view's serving key(s). The on-demand features in the feature vector can be computed using real-time data by passing a dictionary that associates the name of each input parameter needed for the on-demand transformation function with its respective new value to the `request_parameter` argument. -=== "Python" !!! 
example "Computing on-demand features while retrieving a feature vector" - ```python - feature_vector = feature_view.get_feature_vector( - entry={"id": 1}, - request_parameter={ - "transaction_time": datetime(2022, 12, 28, 23, 55, 59), - "current_time": datetime.now(), - }, - ) - ``` + === "Python" + + ```python + feature_vector = feature_view.get_feature_vector( + entry={"id": 1}, + request_parameter={ + "transaction_time": datetime(2022, 12, 28, 23, 55, 59), + "current_time": datetime.now(), + }, + ) + ``` #### Retrieving feature vectors The `get_feature_vectors` function retrieves multiple feature vectors using a list of feature view serving keys. The `request_parameter` in this case, can be a list of dictionaries that specifies the input parameters for the computation of on-demand features for each serving key or can be a dictionary if the on-demand transformations require the same parameters for all serving keys. -=== "Python" !!! example "Computing on-demand features while retrieving a feature vectors" - ```python - # Specify unique request parameters for each serving key. - feature_vector = feature_view.get_feature_vectors( - entry=[{"id": 1}, {"id": 2}], - request_parameter=[ - { + === "Python" + + ```python + # Specify unique request parameters for each serving key. + feature_vector = feature_view.get_feature_vectors( + entry=[{"id": 1}, {"id": 2}], + request_parameter=[ + { + "transaction_time": datetime(2022, 12, 28, 23, 55, 59), + "current_time": datetime.now(), + }, + { + "transaction_time": datetime(2022, 11, 20, 12, 50, 00), + "current_time": datetime.now(), + }, + ], + ) + + # Specify common request parameters for all serving key. 
+ feature_vector = feature_view.get_feature_vectors( + entry=[{"id": 1}, {"id": 2}], + request_parameter={ "transaction_time": datetime(2022, 12, 28, 23, 55, 59), "current_time": datetime.now(), }, - { - "transaction_time": datetime(2022, 11, 20, 12, 50, 00), - "current_time": datetime.now(), - }, - ], - ) - - # Specify common request parameters for all serving key. - feature_vector = feature_view.get_feature_vectors( - entry=[{"id": 1}, {"id": 2}], - request_parameter={ - "transaction_time": datetime(2022, 12, 28, 23, 55, 59), - "current_time": datetime.now(), - }, - ) - ``` + ) + ``` #### Retrieving feature vector without on-demand features The `get_feature_vector` and `get_feature_vectors` methods can return untransformed feature vectors without on-demand features by disabling model-dependent transformations and excluding on-demand features. To achieve this, set the parameters `transform` and `on_demand_features` to `False`. -=== "Python" !!! example "Returning untransformed feature vectors" - ```python - untransformed_feature_vector = feature_view.get_feature_vector( - entry={"id": 1}, transform=False, on_demand_features=False - ) - untransformed_feature_vectors = feature_view.get_feature_vectors( - entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False - ) - ``` + === "Python" + + ```python + untransformed_feature_vector = feature_view.get_feature_vector( + entry={"id": 1}, transform=False, on_demand_features=False + ) + untransformed_feature_vectors = feature_view.get_feature_vectors( + entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False + ) + ``` #### Compute all on-demand features @@ -182,75 +195,78 @@ The `transform` function can be used to apply model-dependent transformations to The `request_parameter` in this case, can be a list of dictionaries that specifies the input parameters for the computation of on-demand features for each feature vector given as input to the function or can be a dictionary if the on-demand 
transformations require the same parameters for all input feature vectors. -=== "Python" !!! example "Computing all on-demand features and manually applying model dependent transformations." - ```python - # Specify request parameters for each serving key. - untransformed_feature_vector = feature_view.get_feature_vector( - entry={"id": 1}, transform=False, on_demand_features=False - ) - - # re-compute and add on-demand features to the feature vector - feature_vector_with_on_demand_features = fv.compute_on_demand_features( - untransformed_feature_vector, - request_parameter={ - "transaction_time": datetime(2022, 12, 28, 23, 55, 59), - "current_time": datetime.now(), - }, - ) - - # Applying model dependent transformations - encoded_feature_vector = fv.transform(feature_vector_with_on_demand_features) - - # Specify request parameters for each serving key. - untransformed_feature_vectors = feature_view.get_feature_vectors( - entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False - ) - - # re-compute and add on-demand features to the feature vectors - Specify unique request parameter for each feature vector - feature_vectors_with_on_demand_features = fv.compute_on_demand_features( - untransformed_feature_vectors, - request_parameter=[ - { + === "Python" + + ```python + # Specify request parameters for each serving key. 
+ untransformed_feature_vector = feature_view.get_feature_vector( + entry={"id": 1}, transform=False, on_demand_features=False + ) + + # re-compute and add on-demand features to the feature vector + feature_vector_with_on_demand_features = fv.compute_on_demand_features( + untransformed_feature_vector, + request_parameter={ "transaction_time": datetime(2022, 12, 28, 23, 55, 59), "current_time": datetime.now(), }, - { - "transaction_time": datetime(2022, 11, 20, 12, 50, 00), - "current_time": datetime.now(), - }, - ], - ) + ) - # re-compute and add on-demand feature to the feature vectors - Specify common request parameter for all feature vectors - feature_vectors_with_on_demand_features = fv.compute_on_demand_features( - untransformed_feature_vectors, - request_parameter={ - "transaction_time": datetime(2022, 12, 28, 23, 55, 59), - "current_time": datetime.now(), - }, - ) + # Applying model dependent transformations + encoded_feature_vector = fv.transform(feature_vector_with_on_demand_features) - # Applying model dependent transformations - encoded_feature_vector = fv.transform(feature_vectors_with_on_demand_features) + # Specify request parameters for each serving key. 
+ untransformed_feature_vectors = feature_view.get_feature_vectors( + entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False + ) - ``` + # re-compute and add on-demand features to the feature vectors - Specify unique request parameter for each feature vector + feature_vectors_with_on_demand_features = fv.compute_on_demand_features( + untransformed_feature_vectors, + request_parameter=[ + { + "transaction_time": datetime(2022, 12, 28, 23, 55, 59), + "current_time": datetime.now(), + }, + { + "transaction_time": datetime(2022, 11, 20, 12, 50, 00), + "current_time": datetime.now(), + }, + ], + ) + + # re-compute and add on-demand feature to the feature vectors - Specify common request parameter for all feature vectors + feature_vectors_with_on_demand_features = fv.compute_on_demand_features( + untransformed_feature_vectors, + request_parameter={ + "transaction_time": datetime(2022, 12, 28, 23, 55, 59), + "current_time": datetime.now(), + }, + ) + + # Applying model dependent transformations + encoded_feature_vector = fv.transform(feature_vectors_with_on_demand_features) + ``` #### Compute one on-demand feature On-demand transformation functions can also be accessed and executed as normal functions by using the dictionary `on_demand_transformations` that maps the on-demand features to their corresponding on-demand transformation function. -=== "Python" !!! example "Executing each on-demand transformation function" - ```python - # Specify request parameters for each serving key. - feature_vector = feature_view.get_feature_vector( - entry={"id": 1}, transform=False, on_demand_features=False, return_type="pandas" - ) - - # Applying model dependent transformations - feature_vector["on_demand_feature1"] = fv.on_demand_transformations[ - "on_demand_feature1" - ](feature_vector["transaction_time"], datetime.now()) - - ``` + === "Python" + + ```python + # Specify request parameters for each serving key. 
+ feature_vector = feature_view.get_feature_vector( + entry={"id": 1}, + transform=False, + on_demand_features=False, + return_type="pandas", + ) + + # Applying model dependent transformations + feature_vector["on_demand_feature1"] = fv.on_demand_transformations[ + "on_demand_feature1" + ](feature_vector["transaction_time"], datetime.now()) + ``` diff --git a/docs/user_guides/fs/feature_group/online_ingestion_observability.md b/docs/user_guides/fs/feature_group/online_ingestion_observability.md index 08305c6fe9..734b3e120a 100644 --- a/docs/user_guides/fs/feature_group/online_ingestion_observability.md +++ b/docs/user_guides/fs/feature_group/online_ingestion_observability.md @@ -28,7 +28,7 @@ First, create an online-enabled feature group and insert data into it: name="feature_group_name", version=feature_group_version, primary_key=feature_group_primary_keys, - online_enabled=True + online_enabled=True, ) fg.insert(fg_df) diff --git a/docs/user_guides/fs/feature_group/statistics.md b/docs/user_guides/fs/feature_group/statistics.md index a2543a0442..cad56e2a46 100644 --- a/docs/user_guides/fs/feature_group/statistics.md +++ b/docs/user_guides/fs/feature_group/statistics.md @@ -45,20 +45,21 @@ By default the value is empty list `[]` and the statistics are computed for all === "Python" ```python - fg = feature_store.create_feature_group(name="weather", + fg = feature_store.create_feature_group( + name="weather", version=1, description="Weather Features", online_enabled=True, - primary_key=['location_id'], - partition_key=['day'], - event_time='event_time', + primary_key=["location_id"], + partition_key=["day"], + event_time="event_time", statistics_config={ "enabled": True, "histograms": True, "correlations": True, "exact_uniqueness": False, - "columns": [] - } + "columns": [], + }, ) ``` @@ -75,7 +76,7 @@ Either to add or remove a class of statistics, or to change the set of features "histograms": False, "correlations": False, "exact_uniqueness": False, - "columns": 
['location_id', 'min_temp', 'max_temp'] + "columns": ["location_id", "min_temp", "max_temp"], } fg.update_statistics_config() @@ -98,7 +99,7 @@ As external feature groups are read only from an Hopsworks perspective, statisti === "Python" ```python - fg.compute_statistics(wallclock_time='20220611 20:00') + fg.compute_statistics(wallclock_time="20220611 20:00") ``` ## Inspect statistics diff --git a/docs/user_guides/fs/feature_group/ttl.md b/docs/user_guides/fs/feature_group/ttl.md index f673605e36..5a8b7e55d0 100644 --- a/docs/user_guides/fs/feature_group/ttl.md +++ b/docs/user_guides/fs/feature_group/ttl.md @@ -27,6 +27,7 @@ Data rows where `event_time` is older than the TTL period will be automatically ```python from datetime import datetime, timezone + import pandas as pd # Assume you already have a feature store handle diff --git a/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md b/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md index d820d1ef8f..c29d2be7a1 100644 --- a/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md +++ b/docs/user_guides/fs/feature_monitoring/feature_monitoring_advanced.md @@ -15,7 +15,7 @@ You can retrieve one or more feature monitoring configurations from the Feature === "Python" - ```python3 + ```python # retrieve all configurations fm_configs = trans_fg.get_feature_monitoring_configs() # from a feature group fm_configs = trans_fv.get_feature_monitoring_configs() # or a feature view @@ -52,7 +52,7 @@ You can easily enable or disable a specific feature monitoring configuration usi === "Python" - ```python3 + ```python # disable a specific feature monitoring configuration fm_config.disable() @@ -79,7 +79,7 @@ To trigger the feature monitoring job once from the Python API, use the feature === "Python" - ```python3 + ```python # run the feature monitoring job once fm_config.run_job() ``` @@ -99,7 +99,7 @@ Alternatively, you can retrieve all the statistics and comparison results 
using === "Python" - ```python3 + ```python # retrieve all feature monitoring results from a specific config fm_results = fm_config.get_history() @@ -120,6 +120,6 @@ You can delete feature monitoring configurations using the Python API only, as s === "Python" - ```python3 + ```python fm_config.delete() ``` diff --git a/docs/user_guides/fs/feature_monitoring/index.md b/docs/user_guides/fs/feature_monitoring/index.md index 1916ec18e7..49757e5184 100644 --- a/docs/user_guides/fs/feature_monitoring/index.md +++ b/docs/user_guides/fs/feature_monitoring/index.md @@ -18,7 +18,7 @@ Hopsworks feature monitoring user interface is centered around two functionaliti To enable feature monitoring in Hopsworks, you need to set the `enable_feature_monitoring` [configuration option](../../../setup_installation/admin/variables.md) to `true`. This can also be achieved in the cluster definition by setting the following attribute: - ``` + ```yaml hopsworks: enable_feature_monitoring: "true" ``` diff --git a/docs/user_guides/fs/feature_view/batch-data.md b/docs/user_guides/fs/feature_view/batch-data.md index c41b9611a8..ac20b86a28 100644 --- a/docs/user_guides/fs/feature_view/batch-data.md +++ b/docs/user_guides/fs/feature_view/batch-data.md @@ -7,14 +7,16 @@ Feature views support batch prediction by returning batch data as a DataFrame ov The resultant DataFrame (or batch-scoring DataFrame) can then be fed to models to make predictions. 
=== "Python" + ```python # get batch data df = feature_view.get_batch_data( - start_time = "20220620", - end_time = "20220627" - ) # return a dataframe + start_time="20220620", end_time="20220627" + ) # return a dataframe ``` + === "Java" + ```java Dataset ds = featureView.getBatchData("20220620", "20220627") ``` @@ -27,14 +29,15 @@ Primary key(s) and event time are not usually included in the feature view query To retrieve the primary key(s) and/or event time when retrieving batch data for inference, you need to set the parameters `primary_key=True` and/or `event_time=True`. === "Python" + ```python # get batch data df = feature_view.get_batch_data( - start_time = "20220620", - end_time = "20220627", - primary_key=True, - event_time=True - ) # return a dataframe with primary keys and event time + start_time="20220620", + end_time="20220627", + primary_key=True, + event_time=True, + ) # return a dataframe with primary keys and event time ``` !!! note All primary and event time columns of all the feature groups included in the feature view will be returned. @@ -47,9 +50,7 @@ If the service is enabled, and you want to read this particular batch data with ```python # get batch data with Hive df = feature_view.get_batch_data( - start_time = "20220620", - end_time = "20220627", - read_options={"use_hive": True} + start_time="20220620", end_time="20220627", read_options={"use_hive": True} ) ``` @@ -71,20 +72,24 @@ It is important to note that in addition to the filters defined in feature view, By default, the `get_batch_data` function returns batch data with model-dependent transformations applied. However, you can retrieve untransformed batch data—while still including on-demand features—by setting the `transform` parameter to `False`. -=== "Python" !!! example "Returning untransformed batch data" - ```python - # Fetching untransformed batch data. 
- untransformed_batch_data = feature_view.get_batch_data(transform=False) - ``` + === "Python" + + ```python + # Fetching untransformed batch data. + untransformed_batch_data = feature_view.get_batch_data(transform=False) + ``` ## Passing Context Variables to Transformation Functions After [defining a transformation function using a context variable](../transformation_functions.md#passing-context-variables-to-transformation-function), you can pass the necessary context variables through the `transformation_context` parameter when fetching batch data. -=== "Python" - !!! example "Passing context variables while fetching batch data." +!!! example "Passing context variables while fetching batch data." + === "Python" + ```python # Passing context variable to IN-MEMORY Training Dataset. - batch_data = feature_view.get_batch_data(transformation_context={"context_parameter":10}) + batch_data = feature_view.get_batch_data( + transformation_context={"context_parameter": 10} + ) ``` diff --git a/docs/user_guides/fs/feature_view/feature-vectors.md b/docs/user_guides/fs/feature_view/feature-vectors.md index 5a712a61e8..c16778a532 100644 --- a/docs/user_guides/fs/feature_view/feature-vectors.md +++ b/docs/user_guides/fs/feature_view/feature-vectors.md @@ -20,22 +20,19 @@ Alternative, you can provide the primary key of the feature groups as the key of It is also possible to provide a subset of the entry, which will be discussed [below](#partial-feature-retrieval). 
=== "Python" + ```python # get a single vector - feature_view.get_feature_vector( - entry = {"pk1": 1, "pk2": 2} - ) + feature_view.get_feature_vector(entry={"pk1": 1, "pk2": 2}) # get multiple vectors feature_view.get_feature_vectors( - entry = [ - {"pk1": 1, "pk2": 2}, - {"pk1": 3, "pk2": 4}, - {"pk1": 5, "pk2": 6} - ] + entry=[{"pk1": 1, "pk2": 2}, {"pk1": 3, "pk2": 4}, {"pk1": 5, "pk2": 6}] ) ``` + === "Java" + ```java // get a single vector Map entry1 = Maps.newHashMap(); @@ -87,14 +84,16 @@ In the example, it is 1 because `right_fg` is in the first join in the query `le It can happen that some of the primary key entries are not available in some or all of the feature groups used by a feature view. Take the above example assuming the feature view consists of two joined feature groups, first one with primary key column `pk1`, the second feature group with primary key column `pk2`. + === "Python" + ```python # get a single vector - feature_view.get_feature_vector( - entry = {"pk1": 1, "pk2": 2} - ) + feature_view.get_feature_vector(entry={"pk1": 1, "pk2": 2}) ``` + === "Java" + ```java // get a single vector Map entry1 = Maps.newHashMap(); @@ -102,21 +101,22 @@ Take the above example assuming the feature view consists of two joined feature entry1.put("pk2", 2); featureView.getFeatureVector(entry1); ``` + This call will raise an exception if `pk1 = 1` OR `pk2 = 2` can't be found but also if `pk1 = 1` AND `pk2 = 2` can't be found, meaning, it will not return a partial or empty feature vector. When retrieving a batch of vectors, the behaviour is slightly different. 
+ === "Python" + ```python # get multiple vectors feature_view.get_feature_vectors( - entry = [ - {"pk1": 1, "pk2": 2}, - {"pk1": 3, "pk2": 4}, - {"pk1": 5, "pk2": 6} - ] + entry=[{"pk1": 1, "pk2": 2}, {"pk1": 3, "pk2": 4}, {"pk1": 5, "pk2": 6}] ) ``` + === "Java" + ```java // get multiple vectors Map entry2 = Maps.newHashMap(); @@ -127,6 +127,7 @@ When retrieving a batch of vectors, the behaviour is slightly different. entry3.put("pk2", 6); featureView.getFeatureVectors(Lists.newArrayList(entry1, entry2, entry3)); ``` + This call will raise an exception if for example for the third entry `pk1 = 5` OR `pk2 = 6` can't be found, however, it will simply not return a vector for this entry if `pk1 = 5` AND `pk2 = 6` can't be found. That means, `get_feature_vectors` will never return partial feature vector, but will omit empty feature vectors. @@ -140,20 +141,18 @@ In the example below, let's say you join 2 feature groups by `fg1.join(fg2, left If `pk2` is not provided, this returns feature values from the first feature group and null values from the second feature group when using the option `allow_missing=True`, otherwise it raises exception. === "Python" + ```python # get a single vector with - feature_view.get_feature_vector( - entry = {"pk1": 1}, - allow_missing=True - ) + feature_view.get_feature_vector(entry={"pk1": 1}, allow_missing=True) # get multiple vectors feature_view.get_feature_vectors( - entry = [ + entry=[ {"pk1": 1}, {"pk1": 3}, ], - allow_missing=True + allow_missing=True, ) ``` @@ -165,6 +164,7 @@ Then you can follow the above examples and retrieve the feature vectors. Please note that transformed feature vectors can only be returned in the python client but not in the java client. 
=== "Python" + ```python feature_view.init_serving(training_dataset_version=1) ``` @@ -180,25 +180,21 @@ The feature view will apply the necessary transformations to the passed features Please note that passed features is only available in the python client but not in the java client. === "Python" + ```python # get a single vector feature_view.get_feature_vector( - entry = {"pk1": 1, "pk2": 2}, - passed_features = {"feature_a": "value_a"} + entry={"pk1": 1, "pk2": 2}, passed_features={"feature_a": "value_a"} ) # get multiple vectors feature_view.get_feature_vectors( - entry = [ - {"pk1": 1, "pk2": 2}, - {"pk1": 3, "pk2": 4}, - {"pk1": 5, "pk2": 6} - ], - passed_features = [ + entry=[{"pk1": 1, "pk2": 2}, {"pk1": 3, "pk2": 4}, {"pk1": 5, "pk2": 6}], + passed_features=[ {"feature_a": "value_a1"}, {"feature_a": "value_a2"}, {"feature_a": "value_a3"}, - ] + ], ) ``` @@ -206,6 +202,7 @@ You can also use the parameter to provide values for all the features which are In this second case, you do not have to provide the primary key value for that feature group as no data needs to be retrieved from the online feature store. === "Python" + ```python # get a single vector, replace values from an entire feature group # note how in this example you don't have to provide the value of @@ -213,12 +210,12 @@ In this second case, you do not have to provide the primary key value for that f # in this case feature_b and feature_c feature_view.get_feature_vector( - entry = { "pk1": 1 }, - passed_features = { + entry={"pk1": 1}, + passed_features={ "feature_a": "value_a", "feature_b": "value_b", - "feature_c": "value_c" - } + "feature_c": "value_c", + }, ) ``` @@ -228,48 +225,56 @@ By default, the `get_feature_vector` and `get_feature_vectors` functions return However, you can retrieve the untransformed feature vectors without applying model-dependent transformations while still including on-demand features by setting the `transform` parameter to False. -=== "Python" !!! 
example "Returning untransformed feature vectors" - ```python - # Fetching untransformed feature vector. - untransformed_feature_vector = feature_view.get_feature_vector( - entry={"id": 1}, transform=False - ) + === "Python" - # Fetching untransformed feature vectors. - untransformed_feature_vectors = feature_view.get_feature_vectors( - entry=[{"id": 1}, {"id": 2}], transform=False - ) - ``` + ```python + # Fetching untransformed feature vector. + untransformed_feature_vector = feature_view.get_feature_vector( + entry={"id": 1}, transform=False + ) + + # Fetching untransformed feature vectors. + untransformed_feature_vectors = feature_view.get_feature_vectors( + entry=[{"id": 1}, {"id": 2}], transform=False + ) + + + ``` ## Retrieving feature vector without on-demand features The `get_feature_vector` and `get_feature_vectors` methods can also return untransformed feature vectors without on-demand features by disabling model-dependent transformations and excluding on-demand features. To achieve this, set the parameters `transform` and `on_demand_features` to `False`. -=== "Python" !!! 
example "Returning untransformed feature vectors" - ```python - untransformed_feature_vector = feature_view.get_feature_vector( - entry={"id": 1}, transform=False, on_demand_features=False - ) - untransformed_feature_vectors = feature_view.get_feature_vectors( - entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False - ) - ``` + === "Python" + + ```python + untransformed_feature_vector = feature_view.get_feature_vector( + entry={"id": 1}, transform=False, on_demand_features=False + ) + untransformed_feature_vectors = feature_view.get_feature_vectors( + entry=[{"id": 1}, {"id": 2}], transform=False, on_demand_features=False + ) + + + ``` ## Passing Context Variables to Transformation Functions After [defining a transformation function using a context variable](../transformation_functions.md#passing-context-variables-to-transformation-function), you can pass the required context variables using the `transformation_context` parameter when fetching the feature vectors. -=== "Python" - !!! example "Passing context variables while fetching batch data." +!!! example "Passing context variables while fetching batch data." + === "Python" + ```python # Passing context variable to IN-MEMORY Training Dataset. 
batch_data = feature_view.get_feature_vectors( - entry = [{ "pk1": 1 }], - transformation_context={"context_parameter":10} + entry=[{"pk1": 1}], transformation_context={"context_parameter": 10} ) + + ``` ## Choose the right Client @@ -298,7 +303,7 @@ my_feature_view.init_serving( init_rest_client=True, config_rest_client={ "api_key": "your_api_key", - } + }, ) ``` @@ -315,19 +320,18 @@ my_feature_view.init_serving( config_rest_client={ "api_key": "your_api_key", }, - default_client="rest" + default_client="rest", ) # this will fetch a feature vector via REST try: my_feature_view.get_feature_vector( - entry = {"pk1": 1, "pk2": 2}, + entry={"pk1": 1, "pk2": 2}, ) except TimeoutException: # if the REST client times out, the SQL client will be used my_feature_view.get_feature_vector( - entry = {"pk1": 1, "pk2": 2}, - force_sql=True + entry={"pk1": 1, "pk2": 2}, force_sql=True ) ``` diff --git a/docs/user_guides/fs/feature_view/feature_logging.md b/docs/user_guides/fs/feature_view/feature_logging.md index ac3d804e53..063e441076 100644 --- a/docs/user_guides/fs/feature_view/feature_logging.md +++ b/docs/user_guides/fs/feature_view/feature_logging.md @@ -46,10 +46,9 @@ You have a DataFrame of features you want to log. ```python import pandas as pd -features = pd.DataFrame({ - "feature1": [1.1, 2.2, 3.3], - "feature2": [4.4, 5.5, 6.6] -}) +features = pd.DataFrame( + {"feature1": [1.1, 2.2, 3.3], "feature2": [4.4, 5.5, 6.6]} +) # Log features feature_view.log(features) @@ -60,15 +59,14 @@ feature_view.log(features) You can also log predictions, and optionally the training dataset and the model used for prediction. 
```python -predictions = pd.DataFrame({ - "prediction": [0, 1, 0] -}) +predictions = pd.DataFrame({"prediction": [0, 1, 0]}) # Log features and predictions -feature_view.log(features, - predictions=predictions, - training_dataset_version=1, - model=Model(1, "model", version=1) +feature_view.log( + features, + predictions=predictions, + training_dataset_version=1, + model=Model(1, "model", version=1), ) ``` @@ -137,7 +135,9 @@ Accepted date format are: `%Y-%m-%d`, `%Y-%m-%d %H`, `%Y-%m-%d %H:%M`, `%Y-%m-%d ```python # Read log entries from January 2022 -log_entries = feature_view.read_log(start_time="2022-01-01", end_time="2022-01-31") +log_entries = feature_view.read_log( + start_time="2022-01-01", end_time="2022-01-31" +) print(log_entries) ``` diff --git a/docs/user_guides/fs/feature_view/feature_monitoring.md b/docs/user_guides/fs/feature_view/feature_monitoring.md index 16c9362b3c..082ad0118a 100644 --- a/docs/user_guides/fs/feature_view/feature_monitoring.md +++ b/docs/user_guides/fs/feature_view/feature_monitoring.md @@ -39,7 +39,8 @@ In order to setup feature monitoring for a Feature View, you will need: Connect the client running your notebooks to Hopsworks. 
=== "Python" - ```python3 + + ```python import hopsworks project = hopsworks.login() @@ -60,7 +61,7 @@ The following is a code example for getting or creating a Feature View with name === "Python" - ```python3 + ```python # Retrieve an existing feature view trans_fv = fs.get_feature_view("trans_fv", version=1) @@ -80,13 +81,13 @@ The following is a code example for creating a training dataset with two splits === "Python" - ```python3 + ```python # Create a training dataset with train and test splits _, _ = trans_fv.create_train_validation_test_split( - description = 'transactions fraud batch training dataset', - data_format = 'csv', - validation_size = 0.2, - test_size = 0.1, + description="transactions fraud batch training dataset", + data_format="csv", + validation_size=0.2, + test_size=0.1, ) ``` @@ -97,7 +98,8 @@ The following is a code example for creating a training dataset with two splits You can setup statistics monitoring on a ==single feature or multiple features== of your Feature Group data, included in your Feature View query. === "Python" - ```python3 + + ```python # compute statistics for all the features fg_monitoring_config = trans_fv.create_statistics_monitoring( name="trans_fv_all_features_monitoring", @@ -118,7 +120,8 @@ When enabling the comparison of statistics in a feature monitoring configuration You can create multiple feature monitoring configurations on the same Feature View, but each of them should point to a single feature in the Feature View query. === "Python" - ```python3 + + ```python fg_monitoring_config = trans_fv.create_feature_monitoring( name="trans_fv_amount_monitoring", feature_name="amount", @@ -132,12 +135,13 @@ By default, the computation of statistics is scheduled to run endlessly, every d You can modify the default schedule by adjusting the `cron_expression`, `start_date_time` and `end_date_time` parameters. 
=== "Python" - ```python3 + + ```python fg_monitoring_config = trans_fv.create_statistics_monitoring( name="trans_fv_all_features_monitoring", description="Compute statistics on all data of all features of the Feature Group data on a weekly basis", cron_expression="0 0 12 ? *MON*", # weekly - row_percentage=0.8, # use 80% of the data + row_percentage=0.8, # use 80% of the data ) # or @@ -146,7 +150,7 @@ You can modify the default schedule by adjusting the `cron_expression`, `start_d feature_name="amount", description="Compute descriptive statistics on the amount Feature of the Feature Group data on a weekly basis", cron_expression="0 0 12 ? * MON *", # weekly - row_percentage=0.8, # use 80% of the data + row_percentage=0.8, # use 80% of the data ) ``` @@ -157,10 +161,11 @@ You can define a different detection window using the `window_length` and `time_ Additionally, you can specify the percentage of feature data on which statistics will be computed using the `row_percentage` parameter. === "Python" - ```python3 + + ```python fm_monitoring_config.with_detection_window( window_length="1w", # data ingested during one week - time_offset="1w", # starting from last week + time_offset="1w", # starting from last week row_percentage=0.8, # use 80% of the data ) ``` @@ -170,11 +175,12 @@ Additionally, you can specify the percentage of feature data on which statistics When setting up feature monitoring for a Feature View, reference windows can be either a regular window, a specific value (i.e., window of size 1) or a training dataset. 
=== "Python" - ```python3 + + ```python # compare statistics against a reference window fm_monitoring_config.with_reference_window( window_length="1w", # data ingested during one week - time_offset="2w", # starting from two weeks ago + time_offset="2w", # starting from two weeks ago row_percentage=0.8, # use 80% of the data ) @@ -185,7 +191,7 @@ When setting up feature monitoring for a Feature View, reference windows can be # or a training dataset fm_monitoring_config.with_reference_training_dataset( - training_dataset_version=1, # use the training dataset used to train your production model + training_dataset_version=1, # use the training dataset used to train your production model ) ``` @@ -196,12 +202,13 @@ First, you select the metric to consider in the comparison using the `metric` pa Then, you can define a relative or absolute threshold using the `threshold` and `relative` parameters. === "Python" - ```python3 + + ```python fm_monitoring_config.compare_on( metric="mean", threshold=0.2, # a relative change over 20% is considered anomalous relative=True, # relative or absolute change - strict=False, # strict or relaxed comparison + strict=False, # strict or relaxed comparison ) ``` @@ -214,7 +221,8 @@ Finally, you can save your feature monitoring configuration by calling the `save Once the configuration is saved, the schedule for the statistics computation and comparison will be activated automatically. === "Python" - ```python3 + + ```python fm_monitoring_config.save() ``` diff --git a/docs/user_guides/fs/feature_view/helper-columns.md b/docs/user_guides/fs/feature_view/helper-columns.md index 450ea0cdcd..cbb22f305e 100644 --- a/docs/user_guides/fs/feature_view/helper-columns.md +++ b/docs/user_guides/fs/feature_view/helper-columns.md @@ -22,23 +22,26 @@ In this use case `expiry_date` is an inference helper column. 
It is not used for training but is necessary for computing the [on-demand feature](../../../concepts/fs/feature_group/on_demand_feature.md)`days_valid` feature. -=== "Python" +!!! example "Define inference columns for feature views." + === "Python" - !!! example "Define inference columns for feature views." ```python # define query object - query = label_fg.select("fraud_label")\ - .join(trans_fg.select(["amount", "days_valid", "expiry_date", "category"])) + query = label_fg.select("fraud_label").join( + trans_fg.select(["amount", "days_valid", "expiry_date", "category"]) + ) # define feature view with helper columns feature_view = fs.get_or_create_feature_view( - name='fv_with_helper_col', + name="fv_with_helper_col", version=1, query=query, labels=["fraud_label"], transformation_functions=transformation_functions, inference_helper_columns=["expiry_date"], ) + + ``` ### Inference Data Retrieval @@ -48,42 +51,57 @@ However, they can be optionally fetched with inference or training data. #### Batch inference -=== "Python" +!!! example "Fetch inference helper column values and compute on-demand features during batch inference." + === "Python" - !!! example "Fetch inference helper column values and compute on-demand features during batch inference." 
```python - # import feature functions from feature_functions import time_delta # Fetch feature view object feature_view = fs.get_feature_view( - name='fv_with_helper_col', + name="fv_with_helper_col", version=1, ) # Fetch feature data for batch inference with helper columns - df = feature_view.get_batch_data(start_time=start_time, end_time=end_time, inference_helpers=True, event_time=True) + df = feature_view.get_batch_data( + start_time=start_time, + end_time=end_time, + inference_helpers=True, + event_time=True, + ) # compute location delta - df['days_valid'] = df.apply(lambda row: time_delta(row['expiry_date'], row['transaction_date']), axis=1) + df["days_valid"] = df.apply( + lambda row: time_delta(row["expiry_date"], row["transaction_date"]), axis=1 + ) # prepare datatame for prediction - df = df[[f.name for f in feature_view.features if not (f.label or f.inference_helper_column or f.training_helper_column)]] + df = df[ + [ + f.name + for f in feature_view.features + if not ( + f.label or f.inference_helper_column or f.training_helper_column + ) + ] + ] + + ``` #### Online inference -=== "Python" +!!! example "Fetch inference helper column values and compute on-demand features during online inference." + === "Python" - !!! example "Fetch inference helper column values and compute on-demand features during online inference." ```python - from feature_functions import time_delta # Fetch feature view object feature_view = fs.get_feature_view( - name='fv_with_helper_col', + name="fv_with_helper_col", version=1, ) @@ -98,15 +116,20 @@ However, they can be optionally fetched with inference or training data. transaction_date = ... 
# get previous transaction location of this credit card - inference_helper = feature_view.get_inference_helper({"cc_num": cc_num}, return_type="dict") + inference_helper = feature_view.get_inference_helper( + {"cc_num": cc_num}, return_type="dict" + ) # compute location delta - days_valid = time_delta(transaction_date, inference_helper['expiry_date']) + days_valid = time_delta(transaction_date, inference_helper["expiry_date"]) # Now get assembled feature vector for prediction - feature_vector = feature_view.get_feature_vector({"cc_num": cc_num}, - passed_features={"days_valid": days_valid} - ) + feature_vector = feature_view.get_feature_vector( + {"cc_num": cc_num}, + passed_features={"days_valid": days_valid}, + ) + + ``` ## Training Helper columns @@ -114,23 +137,26 @@ However, they can be optionally fetched with inference or training data. `training_helper_columns` are a list of feature names that are not the part of the model schema itself but are used during training for the extra information. For example one might want to use feature like `category` of the purchased product to assign different weights. -=== "Python" +!!! example "Define training helper columns for feature views." + === "Python" - !!! example "Define training helper columns for feature views." 
```python # define query object - query = label_fg.select("fraud_label")\ - .join(trans_fg.select(["amount", "days_valid", "expiry_date", "category"])) + query = label_fg.select("fraud_label").join( + trans_fg.select(["amount", "days_valid", "expiry_date", "category"]) + ) # define feature view with helper columns feature_view = fs.get_or_create_feature_view( - name='fv_with_helper_col', + name="fv_with_helper_col", version=1, query=query, labels=["fraud_label"], transformation_functions=transformation_functions, - training_helper_columns=["category"] + training_helper_columns=["category"], ) + + ``` ### Training Data Retrieval @@ -138,33 +164,33 @@ For example one might want to use feature like `category` of the purchased produ When retrieving training data helper columns will be omitted. However, they can be optionally fetched. -=== "Python" +!!! example "Fetch training data with or without inference helper column values." + === "Python" - !!! example "Fetch training data with or without inference helper column values." ```python - # import feature functions from feature_functions import location_delta, time_delta # Fetch feature view object feature_view = fs.get_feature_view( - name='fv_with_helper_col', + name="fv_with_helper_col", version=1, ) # Create and training data with training helper columns TEST_SIZE = 0.2 X_train, X_test, y_train, y_test = feature_view.train_test_split( - description='transactions fraud training dataset', + description="transactions fraud training dataset", test_size=TEST_SIZE, - training_helper_columns=True + training_helper_columns=True, ) # Get existing training data with training helper columns X_train, X_test, y_train, y_test = feature_view.get_train_test_split( - training_dataset_version=1, - training_helper_columns=True + training_dataset_version=1, training_helper_columns=True ) + + ``` !!! 
note diff --git a/docs/user_guides/fs/feature_view/model-dependent-transformations.md b/docs/user_guides/fs/feature_view/model-dependent-transformations.md index b16972b27a..66ebfe518a 100644 --- a/docs/user_guides/fs/feature_view/model-dependent-transformations.md +++ b/docs/user_guides/fs/feature_view/model-dependent-transformations.md @@ -25,50 +25,59 @@ For instance, for the function named `add_one_multiple` that outputs multiple c The function named `add_two` that outputs a single column in the example given below, produces a single output column names as `add_two_feature`. Additionally, Hopsworks also allows users to specify custom names for transformed feature using the [`alias`](../transformation_functions.md#specifying-output-features-names-for-transformation-functions) function. -=== "Python" +!!! example "Creating model-dependent transformation functions" + === "Python" - !!! example "Creating model-dependent transformation functions" ```python # Defining a many to many transformation function. @udf(return_type=[int, int, int], drop=["feature1", "feature3"]) def add_one_multiple(feature1, feature2, feature3): - return pd.DataFrame({"add_one_feature1":feature1 + 1, "add_one_feature2":feature2 + 1, "add_one_feature3":feature3 + 1}) + return pd.DataFrame( + { + "add_one_feature1": feature1 + 1, + "add_one_feature2": feature2 + 1, + "add_one_feature3": feature3 + 1, + } + ) + # Defining a one to one transformation function. @udf(return_type=int) def add_two(feature): return feature + 2 + # Creating model-dependent transformations by attaching transformation functions to feature views. 
feature_view = fs.create_feature_view( - name='transactions_view', + name="transactions_view", query=query, labels=["fraud_label"], - transformation_functions=[ - add_two, - add_one_multiple - ] + transformation_functions=[add_two, add_one_multiple], ) + + ``` ### Specifying input features The features to be used by a model-dependent transformation function can be specified by providing the feature names (from the feature view / feature group) as input to the transformation functions. -=== "Python" +!!! example "Specifying input features to be passed to a model-dependent transformation function" + === "Python" - !!! example "Specifying input features to be passed to a model-dependent transformation function" ```python feature_view = fs.create_feature_view( - name='transactions_view', + name="transactions_view", query=query, labels=["fraud_label"], transformation_functions=[ add_two("feature_1"), add_two("feature_2"), - add_one_multiple("feature_5", "feature_6", "feature_7") - ] + add_one_multiple("feature_5", "feature_6", "feature_7"), + ], ) + + ``` ### Using built-in transformations @@ -76,9 +85,9 @@ The features to be used by a model-dependent transformation function can be spec Built-in transformation functions are attached in the same way. The only difference is that they can either be retrieved from the Hopsworks or imported from the `hopsworks` module. -=== "Python" +!!! example "Creating model-dependent transformation using built-in transformation functions retrieved from Hopsworks" + === "Python" - !!! 
example "Creating model-dependent transformation using built-in transformation functions retrieved from Hopsworks" ```python min_max_scaler = fs.get_transformation_function(name="min_max_scaler") standard_scaler = fs.get_transformation_function(name="standard_scaler") @@ -86,37 +95,46 @@ The only difference is that they can either be retrieved from the Hopsworks or i label_encoder = fs.get_transformation_function(name="label_encoder") feature_view = fs.create_feature_view( - name='transactions_view', + name="transactions_view", query=query, labels=["fraud_label"], - transformation_functions = [ + transformation_functions=[ label_encoder("category"), robust_scaler("amount"), min_max_scaler("loc_delta"), - standard_scaler("age_at_transaction") - ] + standard_scaler("age_at_transaction"), + ], ) + + ``` To attach built-in transformation functions from the `hopsworks` module they can be directly imported into the code from `hopsworks.builtin_transformations`. -=== "Python" +!!! example "Creating model-dependent transformation using built-in transformation functions imported from hopsworks" + === "Python" - !!! 
example "Creating model-dependent transformation using built-in transformation functions imported from hopsworks" ```python - from hopsworks.hsfs.builtin_transformations import min_max_scaler, label_encoder, robust_scaler, standard_scaler + from hopsworks.hsfs.builtin_transformations import ( + label_encoder, + min_max_scaler, + robust_scaler, + standard_scaler, + ) feature_view = fs.create_feature_view( - name='transactions_view', + name="transactions_view", query=query, labels=["fraud_label"], - transformation_functions = [ + transformation_functions=[ label_encoder("category"), robust_scaler("amount"), min_max_scaler("loc_delta"), - standard_scaler("age_at_transaction") - ] + standard_scaler("age_at_transaction"), + ], ) + + ``` ## Using Model Dependent Transformations @@ -127,18 +145,22 @@ The transformed features are organized by their output column names in alphabeti Model-dependent transformation functions can also be manually applied to a feature vector using the `transform` function. -=== "Python" +!!! example "Manually applying model-dependent transformations during online inference" + === "Python" - !!! 
example "Manually applying model-dependent transformations during online inference" ```python # Initialize the feature view with the correct training dataset version used for model-dependent transformations fv.init_serving(training_dataset_version) # Get untransformed feature Vector - feature_vector = fv.get_feature_vector(entry={"index":10}, transform=False, return_type="pandas") + feature_vector = fv.get_feature_vector( + entry={"index": 10}, transform=False, return_type="pandas" + ) # Apply Model Dependent transformations encoded_feature_vector = fv.transform(feature_vector) + + ``` ### Retrieving untransformed feature vector and batch inference data @@ -146,21 +168,22 @@ Model-dependent transformation functions can also be manually applied to a featu The `get_feature_vector`, `get_feature_vectors`, and `get_batch_data` methods can return untransformed feature vectors and batch data without applying model-dependent transformations while still including on-demand features. To achieve this, set the `transform` parameter to False. -=== "Python" !!! example "Returning untransformed feature vectors and batch data." - ```python - # Fetching untransformed feature vector. - untransformed_feature_vector = feature_view.get_feature_vector( - entry={"id": 1}, transform=False - ) - - # Fetching untransformed feature vectors. - untransformed_feature_vectors = feature_view.get_feature_vectors( - entry=[{"id": 1}, {"id": 2}], transform=False - ) - - # Fetching untransformed batch data. - untransformed_batch_data = feature_view.get_batch_data( - transform=False - ) - ``` + === "Python" + + ```python + # Fetching untransformed feature vector. + untransformed_feature_vector = feature_view.get_feature_vector( + entry={"id": 1}, transform=False + ) + + # Fetching untransformed feature vectors. + untransformed_feature_vectors = feature_view.get_feature_vectors( + entry=[{"id": 1}, {"id": 2}], transform=False + ) + + # Fetching untransformed batch data. 
+ untransformed_batch_data = feature_view.get_batch_data(transform=False) + + + ``` diff --git a/docs/user_guides/fs/feature_view/overview.md b/docs/user_guides/fs/feature_view/overview.md index aa66d8f72f..9973f3ad90 100644 --- a/docs/user_guides/fs/feature_view/overview.md +++ b/docs/user_guides/fs/feature_view/overview.md @@ -20,19 +20,18 @@ For example, when a client reads a numerical feature, the feature value could be ```python # create a simple feature view - feature_view = fs.create_feature_view( - name='transactions_view', - query=query - ) + feature_view = fs.create_feature_view(name="transactions_view", query=query) # create a feature view with transformation and label feature_view = fs.create_feature_view( - name='transactions_view', + name="transactions_view", query=query, labels=["fraud_label"], transformation_functions={ - "amount": fs.get_transformation_function(name="standard_scaler", version=1) - } + "amount": fs.get_transformation_function( + name="standard_scaler", version=1 + ) + }, ) ``` @@ -61,10 +60,13 @@ To see a full example of how to create a feature view, you can read [this notebo Once you have created a feature view, you can retrieve it by its name and version. === "Python" + ```python feature_view = fs.get_feature_view(name="transactions_view", version=1) ``` + === "Java" + ```java FeatureView featureView = featureStore.getFeatureView("transactions_view", 1) ``` @@ -75,10 +77,13 @@ If there are some feature view instances which you do not use anymore, you can d It is important to mention that all training datasets (include all materialised hopsfs training data) will be deleted along with the feature view. === "Python" + ```python feature_view.delete() ``` + === "Java" + ```java featureView.delete() ``` @@ -90,6 +95,7 @@ You can attach, get, and remove tags. You can learn more in [Tags Guide](../tags/tags.md). 
=== "Python" + ```python # attach feature_view.add_tag(name="tag_schema", value={"key": "value"}) @@ -97,10 +103,12 @@ You can learn more in [Tags Guide](../tags/tags.md). # get feature_view.get_tag(name="tag_schema") - #remove + # remove feature_view.delete_tag(name="tag_schema") ``` + === "Java" + ```java // attach Map tag = Maps.newHashMap(); diff --git a/docs/user_guides/fs/feature_view/query.md b/docs/user_guides/fs/feature_view/query.md index 5683b4a759..8ea0289256 100644 --- a/docs/user_guides/fs/feature_view/query.md +++ b/docs/user_guides/fs/feature_view/query.md @@ -11,6 +11,7 @@ The joining functionality is heavily inspired by the APIs used by Pandas to merg The APIs allow you to specify which features to select from which feature group, how to join them and which features to use in join conditions. === "Python" + ```python fs = ... credit_card_transactions_fg = fs.get_feature_group(name="credit_card_transactions", version=1) @@ -36,8 +37,8 @@ The APIs allow you to specify which features to select from which feature group, ``` === "Scala" - ```scala + ```scala val fs = ... 
val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions", 1) val accountDetailsFg = fs.getFeatureGroup(name="account_details", version=1) @@ -71,15 +72,19 @@ Most operations performed on `FeatureGroup` metadata objects will return a `Quer Selecting features from a feature group is a lazy operation, returning a query with the selected features only: === "Python" + ```python credit_card_transactions_fg = fs.get_feature_group("credit_card_transactions") # Returns Query - selected_features = credit_card_transactions_fg.select(["amount", "latitude", "longitude"]) + selected_features = credit_card_transactions_fg.select( + ["amount", "latitude", "longitude"] + ) ``` === "Scala" - ```Scala + + ```scala val creditCardTransactionsFg = fs.getFeatureGroup("credit_card_transactions") # Returns Query @@ -93,13 +98,15 @@ The simplest join in one where we join all of the features together from two dif By default, Hopsworks will use the maximal matching subset of the primary keys of the two feature groups as joining key(s), if not specified otherwise. === "Python" + ```python # Returns Query selected_features = credit_card_transactions_fg.join(account_details_fg) ``` === "Scala" - ```Scala + + ```scala // Returns Query val selectedFeatures = creditCardTransactionsFg.join(accountDetailsFg) ``` @@ -111,13 +118,22 @@ features for the join key of the left and right feature group. The join key lists should contain the names of the features to join on. 
=== "Python" + ```python - selected_features = credit_card_transactions_fg.select_all() \ - .join(account_details_fg.select_all(), on=["cc_num"]) \ - .join(merchant_details_fg.select_all(), left_on=["merchant_id"], right_on=["id"], join_type="inner") + selected_features = ( + credit_card_transactions_fg.select_all() + .join(account_details_fg.select_all(), on=["cc_num"]) + .join( + merchant_details_fg.select_all(), + left_on=["merchant_id"], + right_on=["id"], + join_type="inner", + ) + ) ``` === "Scala" + ```scala val selectedFeatures = (creditCardTransactionsFg.selectAll() .join(accountDetailsFg.selectAll(), Seq("cc_num")) @@ -146,6 +162,7 @@ foreign keys for its child feature groups.

=== "Python" + ```python selected_features = credit_card_transactions.select_all() .join(aggregated_cc_transactions.select_all()) @@ -158,6 +175,7 @@ In online inference, when you want to retrieve features in your online model, yo known as the serving_keys, from the parent feature group to retrieve your precomputed feature values using the feature view. === "Python" + ```python feature vector = feature_view.get_feature_vector({ ‘cc_num’: “1234 5555 3333 8888”, @@ -180,6 +198,7 @@ This is called Snowflake Schema data model where you need to build nested table

=== "Python" + ```python nested_selection = aggregated_cc_transactions.select_all() .join(account_details.select_all()) @@ -193,6 +212,7 @@ This is called Snowflake Schema data model where you need to build nested table Now, you have the benefit that in online inference you only need to pass two serving key values (the foreign keys of the leftmost feature group) to retrieve the precomputed features: === "Python" + ```python feature vector = feature_view.get_feature_vector({ ‘cc_num’: “1234 5555 3333 8888”, @@ -209,11 +229,15 @@ Bitwise Operators `&` and `|` are used to construct conjunctions. For the Scala part of the API, equivalent methods are available in the `Feature` and `Filter` classes. === "Python" + ```python - filtered_credit_card_transactions = credit_card_transactions_fg.filter(credit_card_transactions_fg.category == "Grocery") + filtered_credit_card_transactions = credit_card_transactions_fg.filter( + credit_card_transactions_fg.category == "Grocery" + ) ``` === "Scala" + ```scala val filteredCreditCardTransactions = creditCardTransactionsFg.filter(creditCardTransactionsFg.getFeature("category").eq("Grocery")) ``` @@ -221,14 +245,25 @@ For the Scala part of the API, equivalent methods are available in the `Feature` Filters are fully compatible with joins: === "Python" + ```python - selected_features = credit_card_transactions_fg.select_all() \ - .join(account_details_fg.select_all(), on=["cc_num"]) \ - .join(merchant_details_fg.select_all(), left_on=["merchant_id"], right_on=["id"]) \ - .filter((credit_card_transactions_fg.category == "Grocery") | (credit_card_transactions_fg.category == "Restaurant/Cafeteria")) + selected_features = ( + credit_card_transactions_fg.select_all() + .join(account_details_fg.select_all(), on=["cc_num"]) + .join( + merchant_details_fg.select_all(), + left_on=["merchant_id"], + right_on=["id"], + ) + .filter( + (credit_card_transactions_fg.category == "Grocery") + | (credit_card_transactions_fg.category == 
"Restaurant/Cafeteria") + ) + ) ``` === "Scala" + ```scala val selectedFeatures = (creditCardTransactionsFg.selectAll() .join(accountDetailsFg.selectAll(), Seq("cc_num")) @@ -239,14 +274,27 @@ Filters are fully compatible with joins: The filters can be applied at any point of the query: === "Python" + ```python - selected_features = credit_card_transactions_fg.select_all() \ - .join(accountDetails_fg.select_all().filter(accountDetails_fg.avg_temp >= 22), on=["cc_num"]) \ - .join(merchant_details_fg.select_all(), left_on=["merchant_id"], right_on=["id"]) \ + selected_features = ( + credit_card_transactions_fg.select_all() + .join( + accountDetails_fg.select_all().filter( + accountDetails_fg.avg_temp >= 22 + ), + on=["cc_num"], + ) + .join( + merchant_details_fg.select_all(), + left_on=["merchant_id"], + right_on=["id"], + ) .filter(credit_card_transactions_fg.category == "Grocery") + ) ``` === "Scala" + ```scala val selectedFeatures = (creditCardTransactionsFg.selectAll() .join(accountDetailsFg.selectAll().filter(accountDetailsFg.getFeature("avg_temp").ge(22)), Seq("cc_num")) @@ -261,6 +309,7 @@ However, this operation will not update the metadata and persist the updated que This query can then be used to create a new feature view. === "Python" + ```python fs = ... merchant_details_fg = fs.get_feature_group(name="merchant_details", version=1) @@ -272,6 +321,7 @@ This query can then be used to create a new feature view. ``` === "Scala" + ```scala val fs = ... val merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1) @@ -287,6 +337,7 @@ This query can then be used to create a new feature view. To successfully apply new join/filter logic it is recommended to refresh the query instance by re-fetching the feature view: === "Python" + ```python fs = ... @@ -311,6 +362,7 @@ This query can then be used to create a new feature view. ``` === "Scala" + ```scala fs = ... 
merchantDetailsFg = fs.getFeatureGroup("merchant_details", 1) diff --git a/docs/user_guides/fs/feature_view/spine-query.md b/docs/user_guides/fs/feature_view/spine-query.md index f156dd77f9..d2273b539c 100644 --- a/docs/user_guides/fs/feature_view/spine-query.md +++ b/docs/user_guides/fs/feature_view/spine-query.md @@ -21,8 +21,9 @@ The first step before creating a Feature View, is to construct the query by sele ```python # Select features for training data. -ds_query = trans_fg.select(["fraud_label"])\ - .join(window_aggs_fg.select_except(["cc_num"]), on="cc_num") +ds_query = trans_fg.select(["fraud_label"]).join( + window_aggs_fg.select_except(["cc_num"]), on="cc_num" +) ds_query.show(5) ``` @@ -39,14 +40,15 @@ trans_spine = fs.get_or_create_spine_group( name="spine_transactions", version=1, description="Transaction data", - primary_key=['cc_num'], - event_time='datetime', - dataframe=trans_df + primary_key=["cc_num"], + event_time="datetime", + dataframe=trans_df, ) # Select features for training data. -ds_query_spine = trans_spine.select(["fraud_label"])\ - .join(window_aggs_fg.select_except(["cc_num"]), on="cc_num") +ds_query_spine = trans_spine.select(["fraud_label"]).join( + window_aggs_fg.select_except(["cc_num"]), on="cc_num" +) ``` Calling the `show()` or `read()` method of this query object will use the spine dataframe included in the Spine Group object to perform the join. @@ -61,7 +63,7 @@ With the above defined query, we can continue to create the Feature View in the ```python feature_view_spine = fs.get_or_create_feature_view( - name='transactions_view_spine', + name="transactions_view_spine", query=ds_query_spine, version=1, labels=["fraud_label"], @@ -74,7 +76,9 @@ With the regular feature view, the labels are fetched from the feature store, bu Here you have the chance to pass a different set of entities to generate the training dataset. 
```python -X_train, X_test, y_train, y_test = feature_view_spine.train_test_split(0.2, spine=new_entities_df) +X_train, X_test, y_train, y_test = feature_view_spine.train_test_split( + 0.2, spine=new_entities_df +) X_train.show() ``` diff --git a/docs/user_guides/fs/feature_view/training-data.md b/docs/user_guides/fs/feature_view/training-data.md index 33c83faef9..d57d452603 100644 --- a/docs/user_guides/fs/feature_view/training-data.md +++ b/docs/user_guides/fs/feature_view/training-data.md @@ -19,16 +19,16 @@ You can monitor the job status in the [jobs overview UI](../../projects/jobs/pys ```python # create a training dataset as dataframe feature_df, label_df = feature_view.training_data( - description = 'transactions fraud batch training dataset', + description="transactions fraud batch training dataset", ) # materialise a training dataset version, job = feature_view.create_training_data( - description = 'transactions fraud batch training dataset', - data_format = 'csv', - write_options = {"wait_for_job": False} -) # By default, it is materialised to HopsFS -print(job.id) # get the job's id and view the job status in the UI + description="transactions fraud batch training dataset", + data_format="csv", + write_options={"wait_for_job": False}, +) # By default, it is materialised to HopsFS +print(job.id) # get the job's id and view the job status in the UI ``` ### Extra filters @@ -44,13 +44,14 @@ Examples below show how to create training data for different transaction catego ```python # Create a training dataset for Health/Beauty df_health = feature_view.training_data( - description = 'transactions fraud batch training dataset for Health/Beauty', - extra_filter = trans_fg.category == "Health/Beauty" + description="transactions fraud batch training dataset for Health/Beauty", + extra_filter=trans_fg.category == "Health/Beauty", ) # Create a training dataset for Restaurant/Cafeteria and Holliday/Travel df_restaurant_travel = feature_view.training_data( - 
description = 'transactions fraud batch training dataset for Restaurant/Cafeteria and Holliday/Travel', - extra_filter = trans_fg.category == "Restaurant/Cafeteria" and trans_fg.category == "Holliday/Travel" + description="transactions fraud batch training dataset for Restaurant/Cafeteria and Holliday/Travel", + extra_filter=trans_fg.category == "Restaurant/Cafeteria" + and trans_fg.category == "Holliday/Travel", ) ``` @@ -67,9 +68,9 @@ X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2) # materialise a training dataset version, job = feature_view.create_train_test_split( - test_size = 0.2, - description = 'transactions fraud batch training dataset', - data_format = 'csv' + test_size=0.2, + description="transactions fraud batch training dataset", + data_format="csv", ) ``` @@ -77,14 +78,18 @@ Create a training dataset (as in-memory DataFrames) or materialise a training da ```python # create a training dataset as DataFrame -X_train, X_val, X_test, y_train, y_val, y_test = feature_view.train_validation_test_split(validation_size=0.3, test_size=0.2) +X_train, X_val, X_test, y_train, y_val, y_test = ( + feature_view.train_validation_test_split( + validation_size=0.3, test_size=0.2 + ) +) # materialise a training dataset version, job = feature_view.create_train_validation_test_split( - validation_size = 0.3, - test_size = 0.2, - description = 'transactions fraud batch training dataset', - data_format = 'csv' + validation_size=0.3, + test_size=0.2, + description="transactions fraud batch training dataset", + data_format="csv", ) ``` @@ -93,7 +98,9 @@ and you want to create a particular in-memory training dataset with Hive instead ```python # create a training dataset as DataFrame with Hive -X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2, read_options={"use_hive": True}) +X_train, X_test, y_train, y_test = feature_view.train_test_split( + test_size=0.2, read_options={"use_hive": True} +) ``` ## Read Training Data 
@@ -105,13 +112,19 @@ That is, you can delete the training data files (for example, to reduce storage ```python # get a training dataset -feature_df, label_df = feature_view.get_training_data(training_dataset_version=1) +feature_df, label_df = feature_view.get_training_data( + training_dataset_version=1 +) # get a training dataset with train and test splits -X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1) +X_train, X_test, y_train, y_test = feature_view.get_train_test_split( + training_dataset_version=1 +) # get a training dataset with train, validation and test splits -X_train, X_val, X_test, y_train, y_val, y_test = feature_view.get_train_validation_test_split(training_dataset_version=1) +X_train, X_val, X_test, y_train, y_val, y_test = ( + feature_view.get_train_validation_test_split(training_dataset_version=1) +) ``` ## Passing Context Variables to Transformation Functions @@ -121,20 +134,26 @@ Once you have [defined a transformation function using a context variable](../tr !!! note Passing context variables for materializing a training dataset is only supported in the PySpark Kernel. -=== "Python" - !!! example "Passing context variables while creating training data." +!!! example "Passing context variables while creating training data." + === "Python" + ```python # Passing context variable to IN-MEMORY Training Dataset. - X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1, - primary_key=True, - event_time=True, - transformation_context={"context_parameter":10}) + X_train, X_test, y_train, y_test = feature_view.get_train_test_split( + training_dataset_version=1, + primary_key=True, + event_time=True, + transformation_context={"context_parameter": 10}, + ) # Passing context variable to Materialized Training Dataset. 
- version, job = feature_view.get_train_test_split(training_dataset_version=1, - primary_key=True, - event_time=True, - transformation_context={"context_parameter":10}) + version, job = feature_view.get_train_test_split( + training_dataset_version=1, + primary_key=True, + event_time=True, + transformation_context={"context_parameter": 10}, + ) + ``` @@ -146,9 +165,11 @@ To retrieve the primary key(s) and/or event time when retrieving training data, ```python # get a training dataset -X_train, X_test, y_train, y_test = feature_view.get_train_test_split(training_dataset_version=1, - primary_key=True, - event_time=True) +X_train, X_test, y_train, y_test = feature_view.get_train_test_split( + training_dataset_version=1, + primary_key=True, + event_time=True, +) ``` !!! note @@ -186,7 +207,7 @@ feature_view.purge_all_training_data() To recreate a training dataset: ```python -feature_view.recreate_training_dataset(training_dataset_version =1) +feature_view.recreate_training_dataset(training_dataset_version=1) ``` ## Tags @@ -197,16 +218,18 @@ You can learn more in [Tags Guide](../tags/tags.md). 
```python # attach feature_view.add_training_dataset_tag( - training_dataset_version=1, - name="tag_schema", - value={"key": "value"} + training_dataset_version=1, name="tag_schema", value={"key": "value"} ) # get -feature_view.get_training_dataset_tag(training_dataset_version=1, name="tag_schema") +feature_view.get_training_dataset_tag( + training_dataset_version=1, name="tag_schema" +) -#remove -feature_view.delete_training_dataset_tag(training_dataset_version=1, name="tag_schema") +# remove +feature_view.delete_training_dataset_tag( + training_dataset_version=1, name="tag_schema" +) ``` ## Next diff --git a/docs/user_guides/fs/provenance/provenance.md b/docs/user_guides/fs/provenance/provenance.md index 03626c8e00..5584db369b 100644 --- a/docs/user_guides/fs/provenance/provenance.md +++ b/docs/user_guides/fs/provenance/provenance.md @@ -40,9 +40,7 @@ You can inspect the relationship between data sources and feature groups using t # Create the user profiles feature group user_profiles_fg = fs.create_external_feature_group( - name="user_profiles", - version=1, - data_source=ds + name="user_profiles", version=1, data_source=ds ) user_profiles_fg.save() ``` @@ -114,7 +112,9 @@ You can mark the external feature group as parent of the feature group you are c # Do feature engineering age_df = transaction_df.merge(profiles_fg.read(), on="cc_num", how="left") - transaction_df["age_at_transaction"] = (age_df["datetime"] - age_df["birthdate"]) / np.timedelta64(1, "Y") + transaction_df["age_at_transaction"] = ( + age_df["datetime"] - age_df["birthdate"] + ) / np.timedelta64(1, "Y") # Create the transaction feature group transaction_fg = fs.get_or_create_feature_group( @@ -123,7 +123,7 @@ You can mark the external feature group as parent of the feature group you are c description="Transaction features", primary_key=["cc_num"], event_time="datetime", - parents=[profiles_fg] + parents=[profiles_fg], ) transaction_fg.insert(transaction_df) ``` @@ -138,10 +138,12 @@ Another 
example use case for derived feature group is if you have a feature grou daily_transaction_df = daily_transaction_fg.read() # Do feature engineering - cc_group = daily_transaction_df[["cc_num", "amount", "datetime"]] \ - .groupby("cc_num") \ - .rolling("1M", on="datetime") - monthly_transaction_df = pd.DataFrame(cc_group.mean()) + cc_group = ( + daily_transaction_df[["cc_num", "amount", "datetime"]] + .groupby("cc_num") + .rolling("1M", on="datetime") + ) + monthly_transaction_df = pd.DataFrame(cc_group.mean()) # Create the transaction feature group monthly_transaction_fg = fs.get_or_create_feature_group( @@ -150,7 +152,7 @@ Another example use case for derived feature group is if you have a feature grou description="Transaction features - monthly aggregates", primary_key=["cc_num"], event_time="datetime", - parents=[daily_transaction_fg] + parents=[daily_transaction_fg], ) monthly_transaction_fg.insert(monthly_transaction_df) ``` diff --git a/docs/user_guides/fs/sharing/sharing.md b/docs/user_guides/fs/sharing/sharing.md index 23cc6a1f35..1751fcbbf1 100644 --- a/docs/user_guides/fs/sharing/sharing.md +++ b/docs/user_guides/fs/sharing/sharing.md @@ -145,13 +145,11 @@ shared_feature_store = project.get_feature_store(name="name_of_shared_feature_st ```python # Fetch a feature group from the shared feature store shared_fg = shared_feature_store.get_feature_group( - name="shared_fg_name", - version=1 + name="shared_fg_name", version=1 ) # Fetch a feature group from your project's feature store fg = project_feature_store.get_or_create_feature_group( - name="feature_group_name", - version=1 + name="feature_group_name", version=1 ) ``` diff --git a/docs/user_guides/fs/tags/tags.md b/docs/user_guides/fs/tags/tags.md index 10c43e7636..f51ccceee4 100644 --- a/docs/user_guides/fs/tags/tags.md +++ b/docs/user_guides/fs/tags/tags.md @@ -87,9 +87,9 @@ You can attach tags to feature groups and feature views by using the `add_tag()` # Define the tag tag = { - 'business_unit': 
'Fraud', - 'data_owner': 'email@hopsworks.ai', - 'pii': True + "business_unit": "Fraud", + "data_owner": "email@hopsworks.ai", + "pii": True, } # Attach the tag diff --git a/docs/user_guides/fs/transformation_functions.md b/docs/user_guides/fs/transformation_functions.md index 5466e5c969..413fc09dc7 100644 --- a/docs/user_guides/fs/transformation_functions.md +++ b/docs/user_guides/fs/transformation_functions.md @@ -73,12 +73,13 @@ Hopsworks supports four types of transformation functions across all execution m To create a one-to-one transformation function, the Hopsworks `@udf` decorator must be provided with the `return_type` as a single Python type. The transformation function should take one argument as input and return a Pandas Series. -=== "Python" +!!! example "Creation of a one-to-one transformation function in Hopsworks." + === "Python" - !!! example "Creation of a one-to-one transformation function in Hopsworks." ```python from hopsworks import udf + @udf(return_type=int) def add_one(feature): return feature + 1 @@ -88,11 +89,13 @@ The transformation function should take one argument as input and return a Panda The creation of many-to-one transformation functions is similar to that of a one-to-one transformation function, the only difference being that the transformation function accepts multiple features as input. -=== "Python" - !!! example "Creation of a many-to-one transformation function in Hopsworks." +!!! example "Creation of a many-to-one transformation function in Hopsworks." 
+ === "Python" + ```python from hopsworks import udf + @udf(return_type=int) def add_features(feature1, feature2, feature3): return feature1 + feature2 + feature3 @@ -103,11 +106,12 @@ The creation of many-to-one transformation functions is similar to that of a one To create a one-to-many transformation function, the Hopsworks `@udf` decorator must be provided with the `return_type` as a list of Python types, and the transformation function should take one argument as input and return multiple features as a Pandas DataFrame. The return types provided to the decorator must match the types of each column in the returned Pandas DataFrame. -=== "Python" - !!! example "Creation of a one-to-many transformation function in Hopsworks." +!!! example "Creation of a one-to-many transformation function in Hopsworks." + === "Python" + ```python from hopsworks import udf - import pandas as pd + @udf(return_type=[int, int]) def add_one_and_two(feature1): @@ -118,11 +122,12 @@ The return types provided to the decorator must match the types of each column i The creation of a many-to-many transformation function is similar to that of a one-to-many transformation function, the only difference being that the transformation function accepts multiple features as input. -=== "Python" - !!! example "Creation of a many-to-many transformation function in Hopsworks." +!!! example "Creation of a many-to-many transformation function in Hopsworks." + === "Python" + ```python from hopsworks import udf - import pandas as pd + @udf(return_type=[int, int, int]) def add_one_multiple(feature1, feature2, feature3): @@ -134,57 +139,69 @@ The creation of a many-to-many transformation function is similar to that of a o The `mode` parameter of the `@udf` decorator can be used to specify the execution mode of the transformation function. It accepts three possible values `default`, `python` and `pandas`. 
Each mode is explained in more detail below: -#### Default +#### Default Mode This execution mode assumes that the transformation function can be executed as either a Pandas UDF or a Python UDF. It serves as the default mode used when the `mode` parameter is not specified. In this mode, the transformation function is executed as a Pandas UDF during training and in the batch inference pipeline, while it operates as a Python UDF during online inference. -=== "Python" - !!! example "Creating a many to many transformations function using the default execution mode" +!!! example "Creating a many to many transformation function using the default execution mode" + === "Python" + ```python from hopsworks import udf - import pandas as pd + # "default" mode is used if the parameter `mode` is not explicitly set. @udf(return_type=[int, int, int]) def add_one_multiple(feature1, feature2, feature3): return feature1 + 1, feature2 + 1, feature3 + 1 + @udf(return_type=[int, int, int], mode="default") def add_two_multiple(feature1, feature2, feature3): return feature1 + 2, feature2 + 2, feature3 + 2 ``` -#### Python +#### Python Mode The transformation function can be configured to always execute as a Python UDF by setting the `mode` parameter of the `@udf` decorator to `python`. -=== "Python" - !!! example "Creating a many to many transformation function as a Python UDF" +!!! example "Creating a many to many transformation function as a Python UDF" + === "Python" + ```python from hopsworks import udf - import pandas as pd - @udf(return_type=[int, int, int], mode = "python") + + @udf(return_type=[int, int, int], mode="python") def add_one_multiple(feature1, feature2, feature3): return feature1 + 1, feature2 + 1, feature3 + 1 ``` -#### Pandas +#### Pandas Mode The transformation function can be configured to always execute as a Pandas UDF by setting the `mode` parameter of the `@udf` decorator to `pandas`. -=== "Python" - !!! 
example "Creating a many to many transformations function as a Pandas UDF" +!!! example "Creating a many to many transformation function as a Pandas UDF" + === "Python" + ```python - from hopsworks import udf import pandas as pd + from hopsworks import udf + # A Pandas UDF returning a Pandas DataFrame - @udf(return_type=[int, int, int], mode = "pandas") + @udf(return_type=[int, int, int], mode="pandas") def add_one_multiple(feature1, feature2, feature3): - return pd.DataFrame({"add_one_feature1":feature1 + 1, "add_one_feature2":feature2 + 1, "add_one_feature3":feature3 + 1}) + return pd.DataFrame( + { + "add_one_feature1": feature1 + 1, + "add_one_feature2": feature2 + 1, + "add_one_feature3": feature3 + 1, + } + ) + # A Pandas UDF returning multiple Pandas Series @udf(return_type=[int, int, int], mode="pandas") @@ -197,11 +214,12 @@ The transformation function can be configured to always execute as a Pandas UDF The `drop` parameter of the `@udf` decorator is used to drop specific columns in the input DataFrame after transformation. If any argument of the transformation function is passed to the `drop` parameter, then the column mapped to the argument is dropped after the transformation functions are applied. In the example below, the columns mapped to the arguments `feature1` and `feature3` are dropped after the application of all transformation functions. -=== "Python" - !!! example "Specify arguments to drop after transformation" +!!! example "Specify arguments to drop after transformation" + === "Python" + ```python from hopsworks import udf - import pandas as pd + @udf(return_type=[int, int, int], drop=["feature1", "feature3"]) def add_one_multiple(feature1, feature2, feature3): @@ -214,18 +232,22 @@ The [`TransformationFunction.alias`][hsfs.transformation_function.Transformation Each name must be uniques and should be at-most 63 characters long. 
If no name is provided via the `alias` function, Hopsworks generates default output feature names when [on-demand](./feature_group/on_demand_transformations.md) or [model-dependent](./feature_view/model-dependent-transformations.md) transformation functions are created. -=== "Python" - !!! example "Specifying output column names for transformation functions." +!!! example "Specifying output column names for transformation functions." + === "Python" + ```python from hopsworks import udf - import pandas as pd + @udf(return_type=[int, int, int], drop=["feature1", "feature3"]) def add_one_multiple(feature1, feature2, feature3): return feature1 + 1, feature2 + 1, feature3 + 1 + # Specifying output feature names of the transformation function. - add_one_multiple.alias("transformed_feature1", "transformed_feature2", "transformed_feature3") + add_one_multiple.alias( + "transformed_feature1", "transformed_feature2", "transformed_feature3" + ) ``` ### Training dataset statistics @@ -243,17 +265,26 @@ The `TransformationStatistics` instance contains separate objects with the sam These objects encapsulate statistics related to the argument as instances of the class [`FeatureTransformationStatistics`][hsfs.transformation_statistics.FeatureTransformationStatistics]. Upon instantiation, instances of `FeatureTransformationStatistics` contain `None` values and are updated with the required statistics after the creation of a training dataset. -=== "Python" - !!! example "Creation of a transformation function in Hopsworks that uses training dataset statistics" +!!! 
example "Creation of a transformation function in Hopsworks that uses training dataset statistics" + === "Python" + ```python from hopsworks import udf from hopsworks.transformation_statistics import TransformationStatistics stats = TransformationStatistics("argument1", "argument2", "argument3") + @udf(int) def add_features(argument1, argument2, argument3, statistics=stats): - return argument1 + argument2 + argument3 + statistics.argument1.mean + statistics.argument2.mean + statistics.argument3.mean + return ( + argument1 + + argument2 + + argument3 + + statistics.argument1.mean + + statistics.argument2.mean + + statistics.argument3.mean + ) ``` ### Passing context variables to transformation function @@ -262,11 +293,13 @@ The `context` keyword argument can be defined in a transformation function to ac These variables contain common data used across transformation functions. By including the context argument, you can pass the necessary data as a dictionary into the into the `context` argument of the transformation function during [training dataset creation](feature_view/training-data.md#passing-context-variables-to-transformation-functions) or [feature vector retrieval](feature_view/feature-vectors.md#passing-context-variables-to-transformation-functions) or [batch data retrieval](feature_view/batch-data.md#passing-context-variables-to-transformation-functions). -=== "Python" - !!! example "Creation of a transformation function in Hopsworks that accepts context variables" +!!! example "Creation of a transformation function in Hopsworks that accepts context variables" + === "Python" + ```python from hopsworks import udf + @udf(int) def add_features(argument1, context): return argument1 + context["value_to_add"] @@ -277,13 +310,13 @@ By including the context argument, you can pass the necessary data as a dictiona To save a transformation function to the feature store, use the function `create_transformation_function`. 
It creates a [`TransformationFunction`][hsfs.transformation_function.TransformationFunction] object which can then be saved by calling the save function. The save function will throw an error if another transformation function with the same name and version is already saved in the feature store. -=== "Python" +!!! example "Register transformation function `add_one` in the Hopsworks feature store" + === "Python" - !!! example "Register transformation function `add_one` in the Hopsworks feature store" ```python plus_one_meta = fs.create_transformation_function( - transformation_function=add_one, - version=1) + transformation_function=add_one, version=1 + ) plus_one_meta.save() ``` @@ -294,9 +327,9 @@ To retrieve all transformation functions from the feature store, use the functio A specific transformation function can be retrieved using its `name` and `version` with the function `get_transformation_function`. If only the `name` is provided, then the version will default to 1. -=== "Python" +!!! example "Retrieving transformation functions from the feature store" + === "Python" - !!! 
example "Retrieving transformation functions from the feature store" ```python # get all transformation functions fs.get_transformation_functions() diff --git a/docs/user_guides/integrations/databricks/api_key.md b/docs/user_guides/integrations/databricks/api_key.md index 659b14c0ac..810d507953 100644 --- a/docs/user_guides/integrations/databricks/api_key.md +++ b/docs/user_guides/integrations/databricks/api_key.md @@ -19,13 +19,14 @@ For the Databricks integration to work make sure you add the following scopes to ```python hl_lines="6" import hopsworks + project = hopsworks.login( - host='my_instance', # DNS of your Feature Store instance - port=443, # Port to reach your Hopsworks instance, defaults to 443 - project='my_project', # Name of your Hopsworks Feature Store project - api_key_value='apikey', # The API key to authenticate with Hopsworks + host="my_instance", # DNS of your Feature Store instance + port=443, # Port to reach your Hopsworks instance, defaults to 443 + project="my_project", # Name of your Hopsworks Feature Store project + api_key_value="apikey", # The API key to authenticate with Hopsworks ) - fs = project.get_feature_store() # Get the project's default feature store + fs = project.get_feature_store() # Get the project's default feature store ``` ## Next Steps diff --git a/docs/user_guides/integrations/databricks/configuration.md b/docs/user_guides/integrations/databricks/configuration.md index 6e8d8b1130..150de36172 100644 --- a/docs/user_guides/integrations/databricks/configuration.md +++ b/docs/user_guides/integrations/databricks/configuration.md @@ -101,13 +101,14 @@ Once the cluster is running users can establish a connection to the Hopsworks Fe ```python import hopsworks + project = hopsworks.login( - host='my_instance', # DNS of your Hopsworks instance - port=443, # Port to reach your Hopsworks instance, defaults to 443 - project='my_project', # Name of your Hopsworks project - api_key_value='apikey', # The API key to authenticate with 
Hopsworks + host="my_instance", # DNS of your Hopsworks instance + port=443, # Port to reach your Hopsworks instance, defaults to 443 + project="my_project", # Name of your Hopsworks project + api_key_value="apikey", # The API key to authenticate with Hopsworks ) -fs = project.get_feature_store() # Get the project's default feature store +fs = project.get_feature_store() # Get the project's default feature store ``` ## Next Steps diff --git a/docs/user_guides/integrations/emr/emr_configuration.md b/docs/user_guides/integrations/emr/emr_configuration.md index da25a94940..dc4d16bdf8 100644 --- a/docs/user_guides/integrations/emr/emr_configuration.md +++ b/docs/user_guides/integrations/emr/emr_configuration.md @@ -177,7 +177,6 @@ echo -n $(curl -H "Authorization: ApiKey ${API_KEY}" https://$HOST/hopsworks-api chmod -R o-rwx /usr/lib/hopsworks sudo pip3 install --upgrade hopsworks~=X.X.0 - ``` !!! attention "Matching Hopsworks version" diff --git a/docs/user_guides/integrations/hdinsight.md b/docs/user_guides/integrations/hdinsight.md index 8295b9329e..e0abb1f978 100644 --- a/docs/user_guides/integrations/hdinsight.md +++ b/docs/user_guides/integrations/hdinsight.md @@ -143,15 +143,15 @@ import hopsworks # Put the API key into Key Vault for any production setup: # See, https://azure.microsoft.com/en-us/services/key-vault/ -secret_value = 'MY_API_KEY' +secret_value = "MY_API_KEY" # Create a connection project = hopsworks.login( - host='MY_INSTANCE.cloud.hopsworks.ai', # DNS of your Feature Store instance - port=443, # Port to reach your Hopsworks instance, defaults to 443 - project='MY_PROJECT', # Name of your Hopsworks project - api_key_value=secret_value, # The API key to authenticate with Hopsworks - hostname_verification=True # Disable for self-signed certificates + host="MY_INSTANCE.cloud.hopsworks.ai", # DNS of your Feature Store instance + port=443, # Port to reach your Hopsworks instance, defaults to 443 + project="MY_PROJECT", # Name of your Hopsworks project + 
api_key_value=secret_value, # The API key to authenticate with Hopsworks + hostname_verification=True, # Disable for self-signed certificates ) # Get the feature store handle for the project's feature store diff --git a/docs/user_guides/integrations/mlstudio_designer.md b/docs/user_guides/integrations/mlstudio_designer.md index 5acd2c7a3b..de8bb8942c 100644 --- a/docs/user_guides/integrations/mlstudio_designer.md +++ b/docs/user_guides/integrations/mlstudio_designer.md @@ -53,38 +53,40 @@ In the pipeline, add a new `Execute Python Script` step and replace the Python s

```python -import os import importlib.util +import os - -package_name = 'hopsworks' -version = 'MY_VERSION' +package_name = "hopsworks" +version = "MY_VERSION" spec = importlib.util.find_spec(package_name) if spec is None: import os + os.system(f"pip install %s[python]==%s" % (package_name, version)) # Put the API key into Key Vault for any production setup: # See, https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-secrets-in-runs -#from azureml.core import Experiment, Run -#run = Run.get_context() -#secret_value = run.get_secret(name="fs-api-key") -secret_value = 'MY_API_KEY' +# from azureml.core import Experiment, Run +# run = Run.get_context() +# secret_value = run.get_secret(name="fs-api-key") +secret_value = "MY_API_KEY" + -def azureml_main(dataframe1 = None, dataframe2 = None): +def azureml_main(dataframe1=None, dataframe2=None): import hopsworks + project = hopsworks.login( - host='MY_INSTANCE.cloud.hopsworks.ai', # DNS of your Hopsworks instance - port=443, # Port to reach your Hopsworks instance, defaults to 443 - project='MY_PROJECT', # Name of your Hopsworks project - api_key_value=secret_value, # The API key to authenticate with Hopsworks - hostname_verification=True, # Disable for self-signed certificates - engine='python' # Choose python as engine + host="MY_INSTANCE.cloud.hopsworks.ai", # DNS of your Hopsworks instance + port=443, # Port to reach your Hopsworks instance, defaults to 443 + project="MY_PROJECT", # Name of your Hopsworks project + api_key_value=secret_value, # The API key to authenticate with Hopsworks + hostname_verification=True, # Disable for self-signed certificates + engine="python", # Choose python as engine ) - fs = project.get_feature_store() # Get the project's default feature store + fs = project.get_feature_store() # Get the project's default feature store - return fs.get_feature_group('MY_FEATURE_GROUP', version=1).read(), + return (fs.get_feature_group("MY_FEATURE_GROUP", version=1).read(),) ``` Select a 
compute target and save the step. diff --git a/docs/user_guides/integrations/mlstudio_notebooks.md b/docs/user_guides/integrations/mlstudio_notebooks.md index 182b1784f8..ac439bdd7c 100644 --- a/docs/user_guides/integrations/mlstudio_notebooks.md +++ b/docs/user_guides/integrations/mlstudio_notebooks.md @@ -64,19 +64,19 @@ import hopsworks # Put the API key into Key Vault for any production setup: # See, https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-secrets-in-runs -#from azureml.core import Experiment, Run -#run = Run.get_context() -#secret_value = run.get_secret(name="fs-api-key") -secret_value = 'MY_API_KEY' +# from azureml.core import Experiment, Run +# run = Run.get_context() +# secret_value = run.get_secret(name="fs-api-key") +secret_value = "MY_API_KEY" # Create a connection project = hopsworks.login( - host='MY_INSTANCE.cloud.hopsworks.ai', # DNS of your Hopsworks instance - port=443, # Port to reach your Hopsworks instance, defaults to 443 - project='MY_PROJECT', # Name of your Hopsworks project - api_key_value=secret_value, # The API key to authenticate with Hopsworks - hostname_verification=True, # Disable for self-signed certificates - engine='python' # Choose Python as engine + host="MY_INSTANCE.cloud.hopsworks.ai", # DNS of your Hopsworks instance + port=443, # Port to reach your Hopsworks instance, defaults to 443 + project="MY_PROJECT", # Name of your Hopsworks project + api_key_value=secret_value, # The API key to authenticate with Hopsworks + hostname_verification=True, # Disable for self-signed certificates + engine="python", # Choose Python as engine ) # Get the feature store handle for the project's feature store diff --git a/docs/user_guides/integrations/python.md b/docs/user_guides/integrations/python.md index 5e1bc2e66a..2ab241f06d 100644 --- a/docs/user_guides/integrations/python.md +++ b/docs/user_guides/integrations/python.md @@ -47,14 +47,15 @@ You are now ready to connect to Hopsworks from your Python environment: 
```python import hopsworks + project = hopsworks.login( - host='my_instance', # DNS of your Hopsworks instance - port=443, # Port to reach your Hopsworks instance, defaults to 443 - project='my_project', # Name of your Hopsworks project - api_key_value='apikey', # The API key to authenticate with Hopsworks - engine='python', # Use the Python engine + host="my_instance", # DNS of your Hopsworks instance + port=443, # Port to reach your Hopsworks instance, defaults to 443 + project="my_project", # Name of your Hopsworks project + api_key_value="apikey", # The API key to authenticate with Hopsworks + engine="python", # Use the Python engine ) -fs = project.get_feature_store() # Get the project's default feature store +fs = project.get_feature_store() # Get the project's default feature store ``` !!! note "Engine" diff --git a/docs/user_guides/integrations/spark.md b/docs/user_guides/integrations/spark.md index 9dc7747381..18722ac0dd 100644 --- a/docs/user_guides/integrations/spark.md +++ b/docs/user_guides/integrations/spark.md @@ -83,14 +83,15 @@ You are now ready to connect to the Hopsworks Feature Store from Spark: ```python import hopsworks + project = hopsworks.login( - host='my_instance', # DNS of your Feature Store instance - port=443, # Port to reach your Hopsworks instance, defaults to 443 - project='my_project', # Name of your Hopsworks Feature Store project - api_key_value='api_key', # The API key to authenticate with the feature store - hostname_verification=True # Disable for self-signed certificates + host="my_instance", # DNS of your Feature Store instance + port=443, # Port to reach your Hopsworks instance, defaults to 443 + project="my_project", # Name of your Hopsworks Feature Store project + api_key_value="api_key", # The API key to authenticate with the feature store + hostname_verification=True, # Disable for self-signed certificates ) -fs = project.get_feature_store() # Get the project's default feature store +fs = project.get_feature_store() # 
Get the project's default feature store ``` !!! note "Engine" diff --git a/docs/user_guides/migration/40_migration.md b/docs/user_guides/migration/40_migration.md index 4bd5ef6e77..58268d31f6 100644 --- a/docs/user_guides/migration/40_migration.md +++ b/docs/user_guides/migration/40_migration.md @@ -43,6 +43,7 @@ With 4.0, On-Demand Transformation Functions are now better supported which has The following is how transformation functions were used in previous versions of Hopsworks and the how transformation functions are used in the 4.0 release. === "Pre-4.0" + ```python ################################################# # Creating transformation function Hopsworks 3.8# @@ -52,8 +53,10 @@ The following is how transformation functions were used in previous versions of def add_one(feature): return feature + 1 + # Create transformation function - add_one = fs.create_transformation_function(add_one, + add_one = fs.create_transformation_function( + add_one, output_type=int, version=1, ) @@ -69,18 +72,19 @@ The following is how transformation functions were used in previous versions of # Create feature view feature_view = fs.get_or_create_feature_view( - name='serving_fv', + name="serving_fv", version=1, query=selected_features, # Apply your custom transformation functions to the feature `feature_1` transformation_functions={ "feature_1": add_one, }, - labels=['target'], + labels=["target"], ) ``` === "4.0" + ```python ################################################# # Creating transformation function Hopsworks 4.0# @@ -91,16 +95,17 @@ The following is how transformation functions were used in previous versions of def add_one(feature): return feature + 1 + # Create feature view feature_view = fs.get_or_create_feature_view( - name='serving_fv', + name="serving_fv", version=1, query=selected_features, # Apply the custom transformation functions defined to the feature `feature_1` transformation_functions=[ add_one("feature_1"), ], - labels=['target'], + labels=["target"], ) 
``` diff --git a/docs/user_guides/mlops/registry/frameworks/llm.md b/docs/user_guides/mlops/registry/frameworks/llm.md index 9d7edf1623..65bde785c5 100644 --- a/docs/user_guides/mlops/registry/frameworks/llm.md +++ b/docs/user_guides/mlops/registry/frameworks/llm.md @@ -13,6 +13,7 @@ In this guide you will learn how to export a [Large Language Model (LLM)](https: ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -28,14 +29,12 @@ Download your base or fine-tuned LLM. LLMs can typically be downloaded using the official frameworks provided by their creators (e.g., HuggingFace, Ollama, ...) === "Python" + ```python # Download LLM (e.g., using huggingface to download Llama-3.1-8B base model) from huggingface_hub import snapshot_download - model_dir = snapshot_download( - "meta-llama/Llama-3.1-8B", - ignore_patterns="original/*" - ) + model_dir = snapshot_download("meta-llama/Llama-3.1-8B", ignore_patterns="original/*") ``` ### Step 3: (Optional) Fine-tune LLM @@ -44,6 +43,7 @@ If necessary, fine-tune your LLM with an [instruction set](https://www.hopsworks A LLM can be fine-tuned fully or using [Parameter Efficient Fine Tuning (PEFT)](https://www.hopsworks.ai/dictionary/parameter-efficient-fine-tuning-of-llms) methods such as LoRA or QLoRA. === "Python" + ```python # Fine-tune LLM using PEFT (LoRA, QLoRA) or other methods model_dir = ... @@ -55,9 +55,10 @@ Use the `ModelRegistry.llm.create_model(..)` function to register a model as LLM Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. 
=== "Python" + ```python # Model evaluation metrics - metrics = {'f1-score': 0.8, 'perplexity': 31.62, 'bleu-score': 0.73} + metrics = {"f1-score": 0.8, "perplexity": 31.62, "bleu-score": 0.73} llm_model = mr.llm.create_model("llm_model", metrics=metrics) diff --git a/docs/user_guides/mlops/registry/frameworks/python.md b/docs/user_guides/mlops/registry/frameworks/python.md index a7e6ba89d2..e13274b13e 100644 --- a/docs/user_guides/mlops/registry/frameworks/python.md +++ b/docs/user_guides/mlops/registry/frameworks/python.md @@ -13,6 +13,7 @@ In this guide you will learn how to export a generic Python model and register i ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -27,6 +28,7 @@ In this guide you will learn how to export a generic Python model and register i Define your XGBoost model and run the training loop. === "Python" + ```python # Define a model model = XGBClassifier() @@ -40,6 +42,7 @@ Define your XGBoost model and run the training loop. Export the XGBoost model to a directory on the local filesystem. === "Python" + ```python model_file = "model.json" @@ -52,9 +55,10 @@ Use the `ModelRegistry.python.create_model(..)` function to register a model as Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. 
=== "Python" + ```python # Model evaluation metrics - metrics = {'accuracy': 0.92} + metrics = {"accuracy": 0.92} py_model = mr.python.create_model("py_model", metrics=metrics) diff --git a/docs/user_guides/mlops/registry/frameworks/skl.md b/docs/user_guides/mlops/registry/frameworks/skl.md index 7969a3050a..b15fa2ed64 100644 --- a/docs/user_guides/mlops/registry/frameworks/skl.md +++ b/docs/user_guides/mlops/registry/frameworks/skl.md @@ -13,6 +13,7 @@ In this guide you will learn how to export a Scikit-learn model and register it ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -27,6 +28,7 @@ In this guide you will learn how to export a Scikit-learn model and register it Define your Scikit-learn model and run the training loop. === "Python" + ```python # Define a model iris_knn = KNeighborsClassifier(..) @@ -39,6 +41,7 @@ Define your Scikit-learn model and run the training loop. Export the Scikit-learn model to a directory on the local filesystem. === "Python" + ```python model_file = "skl_knn.pkl" @@ -51,9 +54,10 @@ Use the `ModelRegistry.sklearn.create_model(..)` function to register a model as Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. 
=== "Python" + ```python # Model evaluation metrics - metrics = {'accuracy': 0.92} + metrics = {"accuracy": 0.92} skl_model = mr.sklearn.create_model("skl_model", metrics=metrics) diff --git a/docs/user_guides/mlops/registry/frameworks/tch.md b/docs/user_guides/mlops/registry/frameworks/tch.md index 7e0a2aa52f..f83831f6a2 100644 --- a/docs/user_guides/mlops/registry/frameworks/tch.md +++ b/docs/user_guides/mlops/registry/frameworks/tch.md @@ -13,6 +13,7 @@ In this guide you will learn how to export a Torch model and register it in the ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -27,6 +28,7 @@ In this guide you will learn how to export a Torch model and register it in the Define your Torch model and run the training loop. === "Python" + ```python # Define the model architecture class Net(nn.Module): @@ -40,6 +42,7 @@ Define your Torch model and run the training loop. ... return x + # Instantiate the model net = Net() @@ -53,6 +56,7 @@ Define your Torch model and run the training loop. Export the Torch model to a directory on the local filesystem. === "Python" + ```python model_dir = "./model" @@ -65,9 +69,10 @@ Use the `ModelRegistry.torch.create_model(..)` function to register a model as a Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. 
=== "Python" + ```python # Model evaluation metrics - metrics = {'accuracy': 0.92} + metrics = {"accuracy": 0.92} tch_model = mr.torch.create_model("tch_model", metrics=metrics) diff --git a/docs/user_guides/mlops/registry/frameworks/tf.md b/docs/user_guides/mlops/registry/frameworks/tf.md index 5de1535443..f8ce572129 100644 --- a/docs/user_guides/mlops/registry/frameworks/tf.md +++ b/docs/user_guides/mlops/registry/frameworks/tf.md @@ -16,6 +16,7 @@ In this guide you will learn how to export a TensorFlow model and register it in ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -30,6 +31,7 @@ In this guide you will learn how to export a TensorFlow model and register it in Define your TensorFlow model and run the training loop. === "Python" + ```python # Define a model model = tf.keras.Sequential() @@ -49,6 +51,7 @@ Define your TensorFlow model and run the training loop. Export the TensorFlow model to a directory on the local filesystem. === "Python" + ```python model_dir = "./model" @@ -61,9 +64,10 @@ Use the `ModelRegistry.tensorflow.create_model(..)` function to register a model Define a name, and attach optional metrics for your model, then invoke the `save()` function with the parameter being the path to the local directory where the model was exported to. 
=== "Python" + ```python # Model evaluation metrics - metrics = {'accuracy': 0.92} + metrics = {"accuracy": 0.92} tf_model = mr.tensorflow.create_model("tf_model", metrics=metrics) diff --git a/docs/user_guides/mlops/registry/input_example.md b/docs/user_guides/mlops/registry/input_example.md index 86908443f9..24ddf1f88c 100644 --- a/docs/user_guides/mlops/registry/input_example.md +++ b/docs/user_guides/mlops/registry/input_example.md @@ -15,6 +15,7 @@ Attaching an input example to your model will give other users a better understa ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -30,6 +31,7 @@ Generate an input example which corresponds to a valid input to your model. Currently we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list` to be passed as input example. === "Python" + ```python import numpy as np @@ -41,8 +43,8 @@ Currently we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list` to b Set the `input_example` parameter in the `create_model` function and call `save()` to attaching it to the model and register it in the registry. 
=== "Python" + ```python - model = mr.tensorflow.create_model(name="mnist", - input_example=input_example) + model = mr.tensorflow.create_model(name="mnist", input_example=input_example) model.save("./model") ``` diff --git a/docs/user_guides/mlops/registry/model_evaluation_images.md b/docs/user_guides/mlops/registry/model_evaluation_images.md index cf166bb15c..036ff9cad2 100644 --- a/docs/user_guides/mlops/registry/model_evaluation_images.md +++ b/docs/user_guides/mlops/registry/model_evaluation_images.md @@ -16,6 +16,7 @@ By attaching model evaluation images to your versioned model, other users can be ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -30,6 +31,7 @@ By attaching model evaluation images to your versioned model, other users can be Generate an image that visualizes model performance and evaluation metrics === "Python" + ```python import seaborn from sklearn.metrics import confusion_matrix @@ -46,8 +48,8 @@ Generate an image that visualizes model performance and evaluation metrics # Create a DataFrame for the confusion matrix results df_confusion_matrix = pd.DataFrame( results, - ['True Normal', 'True Fraud'], - ['Pred Normal', 'Pred Fraud'], + ["True Normal", "True Fraud"], + ["Pred Normal", "Pred Fraud"], ) # Create a heatmap using seaborn with annotations @@ -63,6 +65,7 @@ Generate an image that visualizes model performance and evaluation metrics Save the figure to a file with a common filename extension (for example, .png or .jpeg), and place it in a directory called `images` - a subdirectory of the model directory that is registered to Hopsworks. 
=== "Python" + ```python # Specify the directory name for saving the model and related artifacts model_dir = "./model" diff --git a/docs/user_guides/mlops/registry/model_schema.md b/docs/user_guides/mlops/registry/model_schema.md index e7c8fe5c53..e8d4de29c2 100644 --- a/docs/user_guides/mlops/registry/model_schema.md +++ b/docs/user_guides/mlops/registry/model_schema.md @@ -15,6 +15,7 @@ Attaching a model schema to your model will give other users a better understand ### Step 1: Connect to Hopsworks === "Python" + ```python import hopsworks @@ -30,19 +31,26 @@ Create a ModelSchema for your inputs and outputs by passing in an example that y Currently, we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list`. === "Python" + ```python # Import a Schema and ModelSchema definition from hsml.utils.model_schema import ModelSchema from hsml.utils.schema import Schema # Model inputs for MNIST dataset - inputs = [{'type': 'uint8', 'shape': [28, 28, 1], 'description': 'grayscale representation of 28x28 MNIST images'}] + inputs = [ + { + "type": "uint8", + "shape": [28, 28, 1], + "description": "grayscale representation of 28x28 MNIST images", + } + ] # Build the input schema input_schema = Schema(inputs) # Model outputs - outputs = [{'type': 'float32', 'shape': [10]}] + outputs = [{"type": "float32", "shape": [10]}] # Build the output schema output_schema = Schema(outputs) @@ -56,8 +64,8 @@ Currently, we support `pandas.DataFrame, pandas.Series, numpy.ndarray, list`. Set the `model_schema` parameter in the `create_model` function and call `save()` to attaching it to the model and register it in the registry. 
=== "Python" + ```python - model = mr.tensorflow.create_model(name="mnist", - model_schema=model_schema) + model = mr.tensorflow.create_model(name="mnist", model_schema=model_schema) model.save("./model") ``` diff --git a/docs/user_guides/mlops/serving/api-protocol.md b/docs/user_guides/mlops/serving/api-protocol.md index cea116f268..b5f11e8989 100644 --- a/docs/user_guides/mlops/serving/api-protocol.md +++ b/docs/user_guides/mlops/serving/api-protocol.md @@ -93,7 +93,7 @@ Once you are done with the changes, click on `Create new deployment` at the bott my_predictor = ms.create_predictor( my_model, - api_protocol="GRPC" # defaults to "REST" + api_protocol="GRPC", # defaults to "REST" ) my_predictor.deploy() @@ -101,6 +101,8 @@ Once you are done with the changes, click on `Create new deployment` at the bott my_deployment = ms.create_deployment(my_predictor) my_deployment.save() + + ``` ### API Reference diff --git a/docs/user_guides/mlops/serving/deployment-state.md b/docs/user_guides/mlops/serving/deployment-state.md index d730683983..7bc60e5815 100644 --- a/docs/user_guides/mlops/serving/deployment-state.md +++ b/docs/user_guides/mlops/serving/deployment-state.md @@ -85,6 +85,8 @@ Additionally, you can find the nº of instances currently running by scrolling d ```python deployment = ms.get_deployment("mydeployment") + + ``` ### Step 3: Inspect deployment state @@ -95,6 +97,8 @@ Additionally, you can find the nº of instances currently running by scrolling d state = deployment.get_state() state.describe() + + ``` ### Step 4: Check nº of running instances @@ -107,6 +111,8 @@ Additionally, you can find the nº of instances currently running by scrolling d # nº of transformer instances deployment.transformer.resources.describe() + + ``` ### API Reference diff --git a/docs/user_guides/mlops/serving/deployment.md b/docs/user_guides/mlops/serving/deployment.md index 488989143f..c514a0fd8f 100644 --- a/docs/user_guides/mlops/serving/deployment.md +++ 
b/docs/user_guides/mlops/serving/deployment.md @@ -158,6 +158,8 @@ Retrieve the trained model you want to deploy. ```python my_model = mr.get_model("my_model", version=1) + + ``` #### Option A: Using the model object @@ -166,6 +168,8 @@ Retrieve the trained model you want to deploy. ```python my_deployment = my_model.deploy() + + ``` #### Option B: Using the Model Serving handle @@ -182,6 +186,8 @@ Retrieve the trained model you want to deploy. # or my_deployment = ms.create_deployment(my_predictor) my_deployment.save() + + ``` ### API Reference diff --git a/docs/user_guides/mlops/serving/inference-batcher.md b/docs/user_guides/mlops/serving/inference-batcher.md index 8dc94ed582..d978b900d9 100644 --- a/docs/user_guides/mlops/serving/inference-batcher.md +++ b/docs/user_guides/mlops/serving/inference-batcher.md @@ -75,12 +75,13 @@ Once you are done with the changes, click on `Create new deployment` at the bott ```python from hsml.inference_batcher import InferenceBatcher - my_batcher = InferenceBatcher(enabled=True, - # optional - max_batch_size=32, - max_latency=5000, # milliseconds - timeout=5 # seconds - ) + my_batcher = InferenceBatcher( + enabled=True, + # optional + max_batch_size=32, + max_latency=5000, # milliseconds + timeout=5, # seconds + ) ``` ### Step 3: Create a deployment with the inference batcher @@ -88,18 +89,17 @@ Once you are done with the changes, click on `Create new deployment` at the bott === "Python" ```python - my_model = mr.get_model("my_model", version=1) - my_predictor = ms.create_predictor(my_model, - inference_batcher=my_batcher - ) + my_predictor = ms.create_predictor(my_model, inference_batcher=my_batcher) my_predictor.deploy() # or my_deployment = ms.create_deployment(my_predictor) my_deployment.save() + + ``` ### API Reference diff --git a/docs/user_guides/mlops/serving/inference-logger.md b/docs/user_guides/mlops/serving/inference-logger.md index 47f8ebaea3..64012324b7 100644 --- 
a/docs/user_guides/mlops/serving/inference-logger.md +++ b/docs/user_guides/mlops/serving/inference-logger.md @@ -78,15 +78,15 @@ Once you are done with the changes, click on `Create new deployment` at the bott === "Python" ```python - from hsml.inference_logger import InferenceLogger from hsml.kafka_topic import KafkaTopic - new_topic = KafkaTopic(name="CREATE", - # optional - num_partitions=1, - num_replicas=1 - ) + new_topic = KafkaTopic( + name="CREATE", + # optional + num_partitions=1, + num_replicas=1, + ) my_logger = InferenceLogger(kafka_topic=new_topic, mode="ALL") ``` @@ -95,7 +95,6 @@ Once you are done with the changes, click on `Create new deployment` at the bott Similarly, you can create the same logger with: ```python - my_logger = InferenceLogger(kafka_topic={"name": "CREATE"}, mode="ALL") ``` @@ -106,15 +105,15 @@ Once you are done with the changes, click on `Create new deployment` at the bott ```python my_model = mr.get_model("my_model", version=1) - my_predictor = ms.create_predictor(my_model, - inference_logger=my_logger - ) + my_predictor = ms.create_predictor(my_model, inference_logger=my_logger) my_predictor.deploy() # or my_deployment = ms.create_deployment(my_predictor) my_deployment.save() + + ``` ### API Reference diff --git a/docs/user_guides/mlops/serving/predictor.md b/docs/user_guides/mlops/serving/predictor.md index 5f77bce975..4470ffac75 100644 --- a/docs/user_guides/mlops/serving/predictor.md +++ b/docs/user_guides/mlops/serving/predictor.md @@ -171,37 +171,40 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 2 (Optional): Implement a predictor script === "Predictor" - ``` python - class Predictor(): + ``` python + class Predictor: def __init__(self): - """ Initialization code goes here""" + """Initialization code goes here""" # Model files can be found at os.environ["MODEL_FILES_PATH"] # self.model = ... 
# load your model def predict(self, inputs): - """ Serve predictions using the trained model""" + """Serve predictions using the trained model""" # Use the model to make predictions # return self.model.predict(inputs) ``` + === "Async Predictor" - ``` python - class Predictor(): + ``` python + class Predictor: def __init__(self): - """ Initialization code goes here""" + """Initialization code goes here""" # Model files can be found at os.environ["MODEL_FILES_PATH"] # self.model = ... # load your model async def predict(self, inputs): - """ Asynchronously serve predictions using the trained model""" + """Asynchronously serve predictions using the trained model""" # Perform async operations that required # result = await some_async_preprocessing(inputs) # Use the model to make predictions # return self.model.predict(result) ``` + === "Predictor (vLLM deployments only)" + ``` python import os from vllm import **version**, AsyncEngineArgs, AsyncLLMEngine @@ -269,8 +272,14 @@ Once you are done with the changes, click on `Create new deployment` at the bott === "Python" ```python - uploaded_file_path = dataset_api.upload("my_predictor.py", "Resources", overwrite=True) - predictor_script_path = os.path.join("/Projects", project.name, uploaded_file_path) + uploaded_file_path = dataset_api.upload( + "my_predictor.py", "Resources", overwrite=True + ) + predictor_script_path = os.path.join( + "/Projects", project.name, uploaded_file_path + ) + + ``` ### Step 4: Define predictor @@ -280,12 +289,15 @@ Once you are done with the changes, click on `Create new deployment` at the bott ```python my_model = mr.get_model("my_model", version=1) - my_predictor = ms.create_predictor(my_model, - # optional - model_server="PYTHON", - serving_tool="KSERVE", - script_file=predictor_script_path - ) + my_predictor = ms.create_predictor( + my_model, + # optional + model_server="PYTHON", + serving_tool="KSERVE", + script_file=predictor_script_path, + ) + + ``` ### Step 5: Create a deployment with 
the predictor @@ -298,6 +310,8 @@ Once you are done with the changes, click on `Create new deployment` at the bott # or my_deployment = ms.create_deployment(my_predictor) my_deployment.save() + + ``` ### API Reference diff --git a/docs/user_guides/mlops/serving/resources.md b/docs/user_guides/mlops/serving/resources.md index 32d99adbaa..2ba350d1a9 100644 --- a/docs/user_guides/mlops/serving/resources.md +++ b/docs/user_guides/mlops/serving/resources.md @@ -84,7 +84,9 @@ Once you are done with the changes, click on `Create new deployment` at the bott minimum_res = Resources(cores=1, memory=128, gpus=1) maximum_res = Resources(cores=2, memory=256, gpus=1) - predictor_res = PredictorResources(num_instances=1, requests=minimum_res, limits=maximum_res) + predictor_res = PredictorResources( + num_instances=1, requests=minimum_res, limits=maximum_res + ) ``` ### Step 3 (Optional): Define the transformer resource configuration @@ -97,7 +99,11 @@ Once you are done with the changes, click on `Create new deployment` at the bott minimum_res = Resources(cores=1, memory=128, gpus=1) maximum_res = Resources(cores=2, memory=256, gpus=1) - transformer_res = TransformerResources(num_instances=2, requests=minimum_res, limits=maximum_res) + transformer_res = TransformerResources( + num_instances=2, requests=minimum_res, limits=maximum_res + ) + + ``` ### Step 4: Create a deployment with the resource configuration @@ -107,17 +113,20 @@ Once you are done with the changes, click on `Create new deployment` at the bott ```python my_model = mr.get_model("my_model", version=1) - my_predictor = ms.create_predictor(my_model, - resources=predictor_res, - # transformer=Transformer(script_file, - # resources=transformer_res) - ) + my_predictor = ms.create_predictor( + my_model, + resources=predictor_res, + # transformer=Transformer(script_file, + # resources=transformer_res) + ) my_predictor.deploy() # or my_deployment = ms.create_deployment(my_predictor) my_deployment.save() + + ``` ### API 
Reference diff --git a/docs/user_guides/mlops/serving/rest-api.md b/docs/user_guides/mlops/serving/rest-api.md index 7d3df7456b..d7e99de1e1 100644 --- a/docs/user_guides/mlops/serving/rest-api.md +++ b/docs/user_guides/mlops/serving/rest-api.md @@ -48,38 +48,30 @@ The request must be sent as a JSON object containing an `inputs` or `instances` See [more information on the request format](https://kserve.github.io/website/docs/concepts/architecture/data-plane/v1-protocol#request-format). An example for this is given below. -=== "Python" +!!! example "REST API example for Predictive Inference (Tensorflow or SkLearn or Python Serving)" + === "Python" - !!! example "REST API example for Predictive Inference (Tensorflow or SkLearn or Python Serving)" ```python import requests - data = { - "inputs": [ - [ - 4641025220953719, - 4920355418495856 - ] - ] - } + data = {"inputs": [[4641025220953719, 4920355418495856]]} headers = { "Host": "fraud.test.hopsworks.ai", "Authorization": "ApiKey 8kDOlnRlJU4kiV1Y.RmFNJY3XKAUSqmJZ03kbUbXKMQSHveSBgMIGT84qrM5qXMjLib7hdlfGeg8fBQZp", - "Content-Type": "application/json" + "Content-Type": "application/json", } response = requests.post( - "http://10.87.42.108/v1/models/fraud:predict", - headers=headers, - json=data + "http://10.87.42.108/v1/models/fraud:predict", headers=headers, json=data ) print(response.json()) + + ``` -=== "Curl" + === "Curl" - !!! 
example "REST API example for Predictive Inference (Tensorflow or SkLearn or Python Serving)" ```bash curl -X POST "http://10.87.42.108/v1/models/fraud:predict" \ -H "Host: fraud.test.hopsworks.ai" \ diff --git a/docs/user_guides/mlops/serving/transformer.md b/docs/user_guides/mlops/serving/transformer.md index 607734b20a..9abf279d59 100644 --- a/docs/user_guides/mlops/serving/transformer.md +++ b/docs/user_guides/mlops/serving/transformer.md @@ -113,18 +113,19 @@ Once you are done with the changes, click on `Create new deployment` at the bott ### Step 2: Implement transformer script === "Transformer" + ```python - class Transformer(): + class Transformer: def __init__(self): - """ Initialization code goes here""" + """Initialization code goes here""" pass def preprocess(self, inputs): - """ Transform the requests inputs here. The object returned by this method will be used as model input to make predictions. """ + """Transform the requests inputs here. The object returned by this method will be used as model input to make predictions.""" return inputs def postprocess(self, outputs): - """ Transform the predictions computed by the model before returning a response """ + """Transform the predictions computed by the model before returning a response""" return outputs ``` @@ -138,8 +139,14 @@ Once you are done with the changes, click on `Create new deployment` at the bott === "Python" ```python - uploaded_file_path = dataset_api.upload("my_transformer.py", "Resources", overwrite=True) - transformer_script_path = os.path.join("/Projects", project.name, uploaded_file_path) + uploaded_file_path = dataset_api.upload( + "my_transformer.py", "Resources", overwrite=True + ) + transformer_script_path = os.path.join( + "/Projects", project.name, uploaded_file_path + ) + + ``` ### Step 4: Define a transformer @@ -154,6 +161,8 @@ Once you are done with the changes, click on `Create new deployment` at the bott from hsml.transformer import Transformer my_transformer = 
Transformer(script_file) + + ``` ### Step 5: Create a deployment with the transformer @@ -167,6 +176,8 @@ Once you are done with the changes, click on `Create new deployment` at the bott # or my_deployment = ms.create_deployment(my_predictor, transformer=my_transformer) my_deployment.save() + + ``` ### API Reference diff --git a/docs/user_guides/mlops/serving/troubleshooting.md b/docs/user_guides/mlops/serving/troubleshooting.md index ba0ce1ac1e..f02d942ab8 100644 --- a/docs/user_guides/mlops/serving/troubleshooting.md +++ b/docs/user_guides/mlops/serving/troubleshooting.md @@ -134,6 +134,8 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f ```python deployment = ms.get_deployment("mydeployment") + + ``` ### Step 3: Get current deployment's predictor state @@ -144,6 +146,8 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f state = deployment.get_state() state.describe() + + ``` ### Step 4: Explore transient logs @@ -152,6 +156,8 @@ Once in the OpenSearch Dashboards, you can search for keywords, apply multiple f ```python deployment.get_logs(component="predictor|transformer", tail=10) + + ``` ### API Reference diff --git a/docs/user_guides/projects/airflow/airflow.md b/docs/user_guides/projects/airflow/airflow.md index 54883e12c1..8d2bb6f638 100644 --- a/docs/user_guides/projects/airflow/airflow.md +++ b/docs/user_guides/projects/airflow/airflow.md @@ -66,12 +66,14 @@ The Airflow DAGs are stored in the _Airflow_ dataset which you can access using When writing the code for the DAG you can invoke the operator as follows: ```python -HopsworksLaunchOperator(dag=dag, - task_id="profiles_fg_0", - project_name="airflow_doc", - job_name="profiles_fg", - job_arguments="", - wait_for_completion=True) +HopsworksLaunchOperator( + dag=dag, + task_id="profiles_fg_0", + project_name="airflow_doc", + job_name="profiles_fg", + job_arguments="", + wait_for_completion=True, +) ``` You should provide the name of the 
Airflow task (`task_id`) and the Hopsworks job information (`project_name`, `job_name`, `job_arguments`). @@ -81,10 +83,12 @@ Similarly, you can invoke the sensor as shown below. You should provide the name of the Airflow task (`task_id`) and the Hopsworks job information (`project_name`, `job_name`) ```python -HopsworksJobSuccessSensor(dag=dag, - task_id='wait_for_profiles_fg', - project_name="airflow_doc", - job_name='profiles_fg') +HopsworksJobSuccessSensor( + dag=dag, + task_id="wait_for_profiles_fg", + project_name="airflow_doc", + job_name="profiles_fg", +) ``` When writing the DAG file, you should also add the `access_control` parameter to the DAG configuration. @@ -96,13 +100,12 @@ If you do not specify the `access_control` option, project members will not be a ```python dag = DAG( - dag_id = "example_dag", - default_args = args, - access_control = { + dag_id="example_dag", + default_args=args, + access_control={ "project_name": {"can_dag_read", "can_dag_edit"}, }, - - schedule_interval = "0 4 * * *" + schedule_interval="0 4 * * *", ) ``` diff --git a/docs/user_guides/projects/git/clone_repo.md b/docs/user_guides/projects/git/clone_repo.md index 342ed7b076..04766d46e4 100644 --- a/docs/user_guides/projects/git/clone_repo.md +++ b/docs/user_guides/projects/git/clone_repo.md @@ -85,26 +85,26 @@ You can also clone a repository through the hopsworks git API in python. 
### Step 1: Get the git API ```python - import hopsworks project = hopsworks.login() git_api = project.get_git_api() - ``` ### Step 2: Clone the repository ```python - -REPO_URL="https://github.com/logicalclocks/hops-examples.git" # git repository -HOPSWORKS_FOLDER="Jupyter" # path in Hopsworks filesystem to clone to -PROVIDER="GitHub" -BRANCH="master" # optional branch to clone - -examples_repo = git_api.clone(REPO_URL, HOPSWORKS_FOLDER, PROVIDER, branch=BRANCH) - +REPO_URL = ( + "https://github.com/logicalclocks/hops-examples.git" # git repository +) +HOPSWORKS_FOLDER = "Jupyter" # path in Hopsworks filesystem to clone to +PROVIDER = "GitHub" +BRANCH = "master" # optional branch to clone + +examples_repo = git_api.clone( + REPO_URL, HOPSWORKS_FOLDER, PROVIDER, branch=BRANCH +) ``` ### API Reference diff --git a/docs/user_guides/projects/git/configure_git_provider.md b/docs/user_guides/projects/git/configure_git_provider.md index 5ee1ab7a13..25a95319e4 100644 --- a/docs/user_guides/projects/git/configure_git_provider.md +++ b/docs/user_guides/projects/git/configure_git_provider.md @@ -63,25 +63,21 @@ You can also configure a git provider using the hopsworks git API in python. 
### Step 1: Get the git API ```python - import hopsworks project = hopsworks.login() git_api = project.get_git_api() - ``` ### Step 2: Configure git provider ```python - -PROVIDER="GitHub" -GITHUB_USER="my_user" -API_TOKEN="my_token" +PROVIDER = "GitHub" +GITHUB_USER = "my_user" +API_TOKEN = "my_token" git_api.set_provider(PROVIDER, GITHUB_USER, API_TOKEN) - ``` ### API Reference diff --git a/docs/user_guides/projects/git/repository_actions.md b/docs/user_guides/projects/git/repository_actions.md index c7c0095caa..3ea1e8fecf 100644 --- a/docs/user_guides/projects/git/repository_actions.md +++ b/docs/user_guides/projects/git/repository_actions.md @@ -45,27 +45,23 @@ You can also perform the repository actions using the hopsworks git API in pytho ### Step 1: Get the git API ```python - import hopsworks project = hopsworks.login() git_api = project.get_git_api() - ``` ### Step 2: Get the git repository ```python git_repo = git_api.get_repo(REPOSITORY_NAME) - ``` ### Step 3: Perform the git repository action e.g commit ```python git_repo = git_api.commit("Test commit") - ``` ### API Reference diff --git a/docs/user_guides/projects/jobs/notebook_job.md b/docs/user_guides/projects/jobs/notebook_job.md index 6b671b441d..2630f1a9a5 100644 --- a/docs/user_guides/projects/jobs/notebook_job.md +++ b/docs/user_guides/projects/jobs/notebook_job.md @@ -145,7 +145,6 @@ This snippet assumes the Jupyter Notebook script is in the current working direc It will upload the Jupyter Notebook script to the `Resources` dataset in your project. 
```python - import hopsworks project = hopsworks.login() @@ -153,7 +152,6 @@ project = hopsworks.login() dataset_api = project.get_dataset_api() uploaded_file_path = dataset_api.upload("notebook.ipynb", "Resources") - ``` ### Step 2: Create Jupyter Notebook job @@ -161,19 +159,17 @@ uploaded_file_path = dataset_api.upload("notebook.ipynb", "Resources") In this snippet we get the `JobsApi` object to get the default job configuration for a `PYTHON` job, set the jupyter notebook file and override the environment to run in, and finally create the `Job` object. ```python - jobs_api = project.get_job_api() notebook_job_config = jobs_api.get_configuration("PYTHON") # Set the application file -notebook_job_config['appPath'] = uploaded_file_path +notebook_job_config["appPath"] = uploaded_file_path # Override the python job environment -notebook_job_config['environmentName'] = "python-feature-pipeline" +notebook_job_config["environmentName"] = "python-feature-pipeline" job = jobs_api.create_job("notebook_job", notebook_job_config) - ``` ### Step 3: Execute the job @@ -181,9 +177,8 @@ job = jobs_api.create_job("notebook_job", notebook_job_config) In this code snippet, we execute the job with arguments and wait until it reaches a terminal state. ```python - # Run the job -execution = job.run(args='-p a 2 -p b 5', await_termination=True) +execution = job.run(args="-p a 2 -p b 5", await_termination=True) ``` ## Configuration diff --git a/docs/user_guides/projects/jobs/pyspark_job.md b/docs/user_guides/projects/jobs/pyspark_job.md index 2921f7bcf2..5ac93555cc 100644 --- a/docs/user_guides/projects/jobs/pyspark_job.md +++ b/docs/user_guides/projects/jobs/pyspark_job.md @@ -174,7 +174,6 @@ This snippet assumes the program to run is in the current working directory and It will upload the python script to the `Resources` dataset in your project. 
```python - import hopsworks project = hopsworks.login() @@ -182,7 +181,6 @@ project = hopsworks.login() dataset_api = project.get_dataset_api() uploaded_file_path = dataset_api.upload("script.py", "Resources") - ``` ### Step 2: Create PySpark job @@ -190,19 +188,17 @@ uploaded_file_path = dataset_api.upload("script.py", "Resources") In this snippet we get the `JobsApi` object to get the default job configuration for a `PYSPARK` job, set the pyspark script and override the environment to run in, and finally create the `Job` object. ```python - jobs_api = project.get_job_api() spark_config = jobs_api.get_configuration("PYSPARK") # Set the application file -spark_config['appPath'] = uploaded_file_path +spark_config["appPath"] = uploaded_file_path # Override the python job environment -spark_config['environmentName'] = "spark-feature-pipeline" +spark_config["environmentName"] = "spark-feature-pipeline" job = jobs_api.create_job("pyspark_job", spark_config) - ``` ### Step 3: Execute the job @@ -210,7 +206,6 @@ job = jobs_api.create_job("pyspark_job", spark_config) In this snippet we execute the job synchronously, that is wait until it reaches a terminal state, and then download and print the logs. 
```python - execution = job.run(await_termination=True) out, err = execution.download_logs() @@ -220,7 +215,6 @@ print(f_out.read()) f_err = open(err, "r") print(f_err.read()) - ``` ## Configuration @@ -259,7 +253,9 @@ To read a dataset in your project using Spark, use the full filesystem path wher For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: ```python -df = spark.read.csv("/Projects/my_project/Resources/data.csv", header=True, inferSchema=True) +df = spark.read.csv( + "/Projects/my_project/Resources/data.csv", header=True, inferSchema=True +) df.show() ``` diff --git a/docs/user_guides/projects/jobs/python_job.md b/docs/user_guides/projects/jobs/python_job.md index 5252f50818..9ab3453a20 100644 --- a/docs/user_guides/projects/jobs/python_job.md +++ b/docs/user_guides/projects/jobs/python_job.md @@ -130,7 +130,6 @@ This snippet assumes the python script is in the current working directory and n It will upload the python script to the `Resources` dataset in your project. ```python - import hopsworks project = hopsworks.login() @@ -138,7 +137,6 @@ project = hopsworks.login() dataset_api = project.get_dataset_api() uploaded_file_path = dataset_api.upload("script.py", "Resources") - ``` ### Step 2: Create Python job @@ -146,19 +144,17 @@ uploaded_file_path = dataset_api.upload("script.py", "Resources") In this snippet we get the `JobsApi` object to get the default job configuration for a `PYTHON` job, set the python script and override the environment to run in, and finally create the `Job` object. 
```python - jobs_api = project.get_job_api() py_job_config = jobs_api.get_configuration("PYTHON") # Set the application file -py_job_config['appPath'] = uploaded_file_path +py_job_config["appPath"] = uploaded_file_path # Override the python job environment -py_job_config['environmentName'] = "python-feature-pipeline" +py_job_config["environmentName"] = "python-feature-pipeline" job = jobs_api.create_job("py_job", py_job_config) - ``` ### Step 3: Execute the job @@ -166,7 +162,6 @@ job = jobs_api.create_job("py_job", py_job_config) In this snippet we execute the job synchronously, that is wait until it reaches a terminal state, and then download and print the logs. ```python - # Run the job execution = job.run(await_termination=True) @@ -178,7 +173,6 @@ print(f_out.read()) f_err = open(err, "r") print(f_err.read()) - ``` ## Configuration diff --git a/docs/user_guides/projects/jobs/ray_job.md b/docs/user_guides/projects/jobs/ray_job.md index e558ff58ad..0c6df6c862 100644 --- a/docs/user_guides/projects/jobs/ray_job.md +++ b/docs/user_guides/projects/jobs/ray_job.md @@ -175,7 +175,6 @@ If the file is already in the project, you can skip this step. It will upload the jar to the `Resources` dataset in your project. ```python - import hopsworks project = hopsworks.login() @@ -183,7 +182,6 @@ project = hopsworks.login() dataset_api = project.get_dataset_api() uploaded_file_path = dataset_api.upload("ray_job.py", "Resources") - ``` ### Step 2: Create Ray job @@ -191,22 +189,20 @@ uploaded_file_path = dataset_api.upload("ray_job.py", "Resources") In this snippet we get the `JobsApi` object to get the default job configuration for a `RAY` job, set the python script to run and create the `Job` object. 
```python - jobs_api = project.get_job_api() ray_config = jobs_api.get_configuration("RAY") -ray_config['appPath'] = uploaded_file_path -ray_config['environmentName'] = "ray-training-pipeline" -ray_config['driverCores'] = 2 -ray_config['driverMemory'] = 2048 -ray_config['workerCores'] = 2 -ray_config['workerMemory'] = 4096 -ray_config['minWorkers'] = 1 -ray_config['maxWorkers'] = 4 +ray_config["appPath"] = uploaded_file_path +ray_config["environmentName"] = "ray-training-pipeline" +ray_config["driverCores"] = 2 +ray_config["driverMemory"] = 2048 +ray_config["workerCores"] = 2 +ray_config["workerMemory"] = 4096 +ray_config["minWorkers"] = 1 +ray_config["maxWorkers"] = 4 job = jobs_api.create_job("ray_job", ray_config) - ``` ### Step 3: Execute the job @@ -214,7 +210,6 @@ job = jobs_api.create_job("ray_job", ray_config) In this snippet we execute the job synchronously, that is wait until it reaches a terminal state, and then download and print the logs. ```python - execution = job.run(await_termination=True) out, err = execution.download_logs() @@ -224,7 +219,6 @@ print(f_out.read()) f_err = open(err, "r") print(f_err.read()) - ``` ## Configuration diff --git a/docs/user_guides/projects/jobs/spark_job.md b/docs/user_guides/projects/jobs/spark_job.md index 1fb25ff610..925b6d97d8 100644 --- a/docs/user_guides/projects/jobs/spark_job.md +++ b/docs/user_guides/projects/jobs/spark_job.md @@ -178,7 +178,6 @@ This snippet assumes the Spark program is in the current working directory and n It will upload the jar to the `Resources` dataset in your project. 
```python - import hopsworks project = hopsworks.login() @@ -186,7 +185,6 @@ project = hopsworks.login() dataset_api = project.get_dataset_api() uploaded_file_path = dataset_api.upload("sparkpi.jar", "Resources") - ``` ### Step 2: Create Spark job @@ -194,16 +192,14 @@ uploaded_file_path = dataset_api.upload("sparkpi.jar", "Resources") In this snippet we get the `JobsApi` object to get the default job configuration for a `SPARK` job, set the python script to run and create the `Job` object. ```python - jobs_api = project.get_job_api() spark_config = jobs_api.get_configuration("SPARK") -spark_config['appPath'] = uploaded_file_path -spark_config['mainClass'] = 'org.apache.spark.examples.SparkPi' +spark_config["appPath"] = uploaded_file_path +spark_config["mainClass"] = "org.apache.spark.examples.SparkPi" job = jobs_api.create_job("pyspark_job", spark_config) - ``` ### Step 3: Execute the job @@ -211,7 +207,6 @@ job = jobs_api.create_job("pyspark_job", spark_config) In this snippet we execute the job synchronously, that is wait until it reaches a terminal state, and then download and print the logs. 
```python - execution = job.run(await_termination=True) out, err = execution.download_logs() @@ -221,7 +216,6 @@ print(f_out.read()) f_err = open(err, "r") print(f_err.read()) - ``` ## Configuration diff --git a/docs/user_guides/projects/jupyter/spark_notebook.md b/docs/user_guides/projects/jupyter/spark_notebook.md index 17eeb21674..f0ad43bc82 100644 --- a/docs/user_guides/projects/jupyter/spark_notebook.md +++ b/docs/user_guides/projects/jupyter/spark_notebook.md @@ -147,7 +147,9 @@ To read a dataset in your project using Spark, use the full filesystem path wher For example, to read a CSV file named `data.csv` located in the `Resources` dataset of a project called `my_project`: ```python -df = spark.read.csv("/Projects/my_project/Resources/data.csv", header=True, inferSchema=True) +df = spark.read.csv( + "/Projects/my_project/Resources/data.csv", header=True, inferSchema=True +) df.show() ``` diff --git a/docs/user_guides/projects/kafka/consume_messages.md b/docs/user_guides/projects/kafka/consume_messages.md index 6874d903ac..72e7be6e12 100644 --- a/docs/user_guides/projects/kafka/consume_messages.md +++ b/docs/user_guides/projects/kafka/consume_messages.md @@ -16,39 +16,33 @@ In this guide, you will learn how to consume messages from a kafka topic. 
### Step 1: Get the Kafka API ```python - import hopsworks project = hopsworks.login() kafka_api = project.get_kafka_api() - ``` ### Step 2: Configure confluent-kafka client ```python - consumer_config = kafka_api.get_default_config() -consumer_config['default.topic.config'] = {'auto.offset.reset': 'earliest'} +consumer_config["default.topic.config"] = {"auto.offset.reset": "earliest"} from confluent_kafka import Consumer consumer = Consumer(consumer_config) - ``` ### Step 3: Consume messages from a topic ```python - # Subscribe to topic consumer.subscribe(["my_topic"]) for i in range(0, 10): msg = consumer.poll(timeout=10.0) print(msg.value()) - ``` ### API Reference diff --git a/docs/user_guides/projects/kafka/create_schema.md b/docs/user_guides/projects/kafka/create_schema.md index cebd9fe307..e36e070c0f 100644 --- a/docs/user_guides/projects/kafka/create_schema.md +++ b/docs/user_guides/projects/kafka/create_schema.md @@ -9,13 +9,11 @@ In this guide, you will learn how to create a Kafka Avro Schema in the Hopsworks ### Step 1: Get the Kafka API ```python - import hopsworks project = hopsworks.login() kafka_api = project.get_kafka_api() - ``` ### Step 2: Define the schema @@ -23,25 +21,14 @@ kafka_api = project.get_kafka_api() Define the Avro Schema, see [types](https://avro.apache.org/docs/current/spec.html#schema_primitive) for the format of the schema. ```python - schema = { "type": "record", "name": "tutorial", "fields": [ - { - "name": "id", - "type": "int" - }, - { - "name": "data", - "type": "string" - } - ] + {"name": "id", "type": "int"}, + {"name": "data", "type": "string"}, + ], } - - - - ``` ### Step 3: Create the schema @@ -49,11 +36,9 @@ schema = { Create the schema in the Schema Registry. 
```python - -SCHEMA_NAME="schema_example" +SCHEMA_NAME = "schema_example" my_schema = kafka_api.create_schema(SCHEMA_NAME, schema) - ``` ### API Reference diff --git a/docs/user_guides/projects/kafka/create_topic.md b/docs/user_guides/projects/kafka/create_topic.md index 7ab420b4b7..a6c3826a48 100644 --- a/docs/user_guides/projects/kafka/create_topic.md +++ b/docs/user_guides/projects/kafka/create_topic.md @@ -16,24 +16,22 @@ In this guide, you will learn how to create a Kafka Topic. ### Step 1: Get the Kafka API ```python - import hopsworks project = hopsworks.login() kafka_api = project.get_kafka_api() - ``` ### Step 2: Define the schema ```python +TOPIC_NAME = "topic_example" +SCHEMA_NAME = "schema_example" -TOPIC_NAME="topic_example" -SCHEMA_NAME="schema_example" - -my_topic = kafka_api.create_topic(TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1) - +my_topic = kafka_api.create_topic( + TOPIC_NAME, SCHEMA_NAME, 1, replicas=1, partitions=1 +) ``` ### API Reference diff --git a/docs/user_guides/projects/kafka/produce_messages.md b/docs/user_guides/projects/kafka/produce_messages.md index a44a3804c3..4f97630d20 100644 --- a/docs/user_guides/projects/kafka/produce_messages.md +++ b/docs/user_guides/projects/kafka/produce_messages.md @@ -16,41 +16,37 @@ In this guide, you will learn how to produce messages to a kafka topic. 
### Step 1: Get the Kafka API ```python - import hopsworks project = hopsworks.login() kafka_api = project.get_kafka_api() - ``` ### Step 2: Configure confluent-kafka client ```python - producer_config = kafka_api.get_default_config() from confluent_kafka import Producer producer = Producer(producer_config) - ``` ### Step 3: Produce messages to topic ```python - -import uuid import json +import uuid # Send a few messages for i in range(0, 10): - producer.produce("my_topic", json.dumps({"id": i, "data": str(uuid.uuid1())}), "key") + producer.produce( + "my_topic", json.dumps({"id": i, "data": str(uuid.uuid1())}), "key" + ) # Trigger the sending of all messages to the brokers, 10 sec timeout producer.flush(10) - ``` ### API Reference diff --git a/docs/user_guides/projects/opensearch/connect.md b/docs/user_guides/projects/opensearch/connect.md index 00d701f088..481ac97437 100644 --- a/docs/user_guides/projects/opensearch/connect.md +++ b/docs/user_guides/projects/opensearch/connect.md @@ -14,23 +14,19 @@ In this guide, you will learn how to connect to the OpenSearch cluster using an ### Step 1: Get the OpenSearch API ```python - import hopsworks project = hopsworks.login() opensearch_api = project.get_opensearch_api() - ``` ### Step 2: Configure the opensearch-py client ```python - from opensearchpy import OpenSearch client = OpenSearch(**opensearch_api.get_default_py_config()) - ``` ### API Reference diff --git a/docs/user_guides/projects/opensearch/knn.md b/docs/user_guides/projects/opensearch/knn.md index a1c91db57f..30f54e7f3e 100644 --- a/docs/user_guides/projects/opensearch/knn.md +++ b/docs/user_guides/projects/opensearch/knn.md @@ -17,6 +17,7 @@ In this guide, you will learn how to create a simple recommendation application, ### Step 1: Get the OpenSearch API === "Python" + ```python import hopsworks @@ -28,6 +29,7 @@ In this guide, you will learn how to create a simple recommendation application, ### Step 2: Configure the opensearch-py client === "Python" + 
```python from opensearchpy import OpenSearch @@ -39,6 +41,7 @@ In this guide, you will learn how to create a simple recommendation application, Create an index to use by calling `opensearch_api.get_project_index(..)`. === "Python" + ```python knn_index_name = opensearch_api.get_project_index("demo_knn_index") @@ -48,13 +51,8 @@ Create an index to use by calling `opensearch_api.get_project_index(..)`. "knn.algo_param.ef_search": 100, }, "mappings": { - "properties": { - "my_vector1": { - "type": "knn_vector", - "dimension": 2 - } - } - } + "properties": {"my_vector1": {"type": "knn_vector", "dimension": 2}} + }, } response = client.indices.create(knn_index_name, body=index_body) @@ -68,17 +66,19 @@ Ingest 10 vectors in a bulk fashion to the index. These vectors represent the list of vectors to calculate the similarity for. === "Python" + ```python - from opensearchpy.helpers import bulk import random + from opensearchpy.helpers import bulk + actions = [ { "_index": knn_index_name, "_id": count, "_source": { "my_vector1": [random.uniform(0, 10), random.uniform(0, 10)], - } + }, } for count in range(0, 10) ] @@ -94,28 +94,20 @@ These vectors represent the list of vectors to calculate the similarity for. Score the vector `[2.5, 3]` and find the 3 most similar vectors. 
=== "Python" + ```python # Define the search request query = { "size": 3, - "query": { - "knn": { - "my_vector1": { - "vector": [2.5, 3], - "k": 3 - } - } - } + "query": {"knn": {"my_vector1": {"vector": [2.5, 3], "k": 3}}}, } # Perform the similarity search - response = client.search( - body = query, - index = knn_index_name - ) + response = client.search(body=query, index=knn_index_name) # Pretty print response import pprint + pp = pprint.PrettyPrinter() pp.pprint(response) ``` diff --git a/docs/user_guides/projects/python/custom_commands.md b/docs/user_guides/projects/python/custom_commands.md index 892b5995a2..9d9f151f3a 100644 --- a/docs/user_guides/projects/python/custom_commands.md +++ b/docs/user_guides/projects/python/custom_commands.md @@ -44,12 +44,11 @@ You can also run the custom commands using the REST API. From the REST API, you should provide the path, in HOPSFS, to the bash script and the artifacts(comma separated string of paths in HopsFs). The REST API endpoint for running custom commands is: `hopsworks-api/api/project//python/environments//commands/custom` and the body should look like this: -```python +```json { "commandsFile": "", "artifacts": "" } - ``` ## What to include in the bash script diff --git a/docs/user_guides/projects/python/python_env_clone.md b/docs/user_guides/projects/python/python_env_clone.md index 3ad5b1965d..dd47e26847 100644 --- a/docs/user_guides/projects/python/python_env_clone.md +++ b/docs/user_guides/projects/python/python_env_clone.md @@ -1,60 +1,60 @@ -# How To Clone Python Environment - -## Introduction - -Cloning an environment in Hopsworks means creating a copy of one of the base environments. -The base environments are immutable, meaning that it is required to clone an environment before you can make any change to it, such as installing your own libraries. 
-This ensures that the project maintains a set of stable environments that are tested with the capabilities of the platform, meanwhile through cloning, allowing users to further customize an environment without affecting the base environments. - -In this guide, you will learn how to clone an environment. - -## Step 1: Select an environment - -Under the `Project settings` section you can find the `Python environment` setting. - -First select an environment, for example the `python-feature-pipeline`. - -

-

- -
Select a base environment
-
-

- -## Step 2: Clone environment - -The environment can now be cloned by clicking `Clone env` and entering a name and description. -The interface will show `Syncing packages` while creating the environment. - -

-

- Create environment -
Clone a base environment
-
-

- -## Step 3: Environment is now ready - -

-

- -
Environment is now cloned
-
-

- -!!! notice "What does the CUSTOM mean?" - Notice that the cloned environment is tagged as `CUSTOM`, it means that it is a base environment which has been cloned. - -!!! notice "Base environment also marked" - When you select a `CUSTOM` environment the base environment it was cloned from is also shown. - -## Concerning upgrades - -!!! warning "Please note" - The base environments are automatically upgraded when Hopsworks is upgraded and application code should keep functioning provided that no breaking changes were made in the upgraded version of the environment. - A `CUSTOM` environment is not automatically upgraded and the users is recommended to reapply the modifications to a base environment if they encounter issues after an upgrade. - -## Next steps - -In this guide you learned how to clone a new environment. -The next step is to [install](python_install.md) a library in the environment. +# How To Clone Python Environment + +## Introduction + +Cloning an environment in Hopsworks means creating a copy of one of the base environments. +The base environments are immutable, meaning that it is required to clone an environment before you can make any change to it, such as installing your own libraries. +This ensures that the project maintains a set of stable environments that are tested with the capabilities of the platform, meanwhile through cloning, allowing users to further customize an environment without affecting the base environments. + +In this guide, you will learn how to clone an environment. + +## Step 1: Select an environment + +Under the `Project settings` section you can find the `Python environment` setting. + +First select an environment, for example the `python-feature-pipeline`. + +

+

+ +
Select a base environment
+
+

+ +## Step 2: Clone environment + +The environment can now be cloned by clicking `Clone env` and entering a name and description. +The interface will show `Syncing packages` while creating the environment. + +

+

+ Create environment +
Clone a base environment
+
+

+ +## Step 3: Environment is now ready + +

+

+ +
Environment is now cloned
+
+

+ +!!! notice "What does the CUSTOM mean?" + Notice that the cloned environment is tagged as `CUSTOM`, it means that it is a base environment which has been cloned. + +!!! notice "Base environment also marked" + When you select a `CUSTOM` environment the base environment it was cloned from is also shown. + +## Concerning upgrades + +!!! warning "Please note" + The base environments are automatically upgraded when Hopsworks is upgraded and application code should keep functioning provided that no breaking changes were made in the upgraded version of the environment. + A `CUSTOM` environment is not automatically upgraded and the users is recommended to reapply the modifications to a base environment if they encounter issues after an upgrade. + +## Next steps + +In this guide you learned how to clone a new environment. +The next step is to [install](python_install.md) a library in the environment. diff --git a/docs/user_guides/projects/python/python_env_overview.md b/docs/user_guides/projects/python/python_env_overview.md index c9f190d2a7..e9ae4a0ce3 100644 --- a/docs/user_guides/projects/python/python_env_overview.md +++ b/docs/user_guides/projects/python/python_env_overview.md @@ -1,66 +1,66 @@ -# Python Environments - -## Introduction - -Hopsworks postulates that building ML systems following the FTI pipeline architecture is best practice. -This architecture consists of three independently developed and operated ML pipelines: - -- Feature Pipeline: takes as input raw data that it transforms into features (and labels) -- Training Pipeline: takes as input features (and labels) and outputs a trained model -- Inference Pipeline: takes new feature data and a trained model and makes predictions. - -In order to facilitate the development of these pipelines Hopsworks bundles several python environments containing necessary dependencies. 
-Each environment can also be customized further by installing additional dependencies from PyPi, Conda, Wheel files, GitHub repos or applying custom Dockerfiles on top. - -### Step 1: Go to environments page - -Under the `Project settings` section you can find the `Python environment` setting. - -### Step 2: List available environments - -Environments listed under `FEATURE ENGINEERING` corresponds to environments you would use in a feature pipeline, `MODEL TRAINING` maps to environments used in a training pipeline and `MODEL INFERENCE` are what you would use in inference pipelines. - -

-

- Bundled python environments -
Bundled python environments
-
-

- -!!! note "Python version" - The python version used in all the environments is 3.11. - -### Feature engineering - -The `FEATURE ENGINEERING` environments can be used in [Jupyter notebooks](../jupyter/python_notebook.md), a [Python job](../jobs/python_job.md) or a [PySpark job](../jobs/pyspark_job.md). - -- `python-feature-pipeline` for writing feature pipelines using Python -- `spark-feature-pipeline` for writing feature pipelines using PySpark - -### Model training - -The `MODEL TRAINING` environments can be used in [Jupyter notebooks](../jupyter/python_notebook.md) or a [Python job](../jobs/python_job.md) or in a [Ray job](../jobs/ray_job.md). - -- `tensorflow-training-pipeline` to train TensorFlow models -- `torch-training-pipeline` to train PyTorch models -- `pandas-training-pipeline` to train XGBoost, Catboost and Sklearn models -- `ray_training_pipeline` a general purpose environment for distributed training using Ray framework to train XGBoost and Sklearn models. - Should be used in [Ray job](../jobs/ray_job.md). - It can be customized to install additional dependencies of your choice. -- `ray_torch_training_pipeline` for distributed training of PyTorch models using Ray framework in a [Ray job](../jobs/ray_job.md) -- `ray_tensorflow_training_pipeline` for distributed training of TensorFlow models using Ray framework in a [Ray job](../jobs/ray_job.md) - -### Model inference - -The `MODEL INFERENCE` environments can be used in a deployment using a custom predictor script. 
- -- `tensorflow-inference-pipeline` to load and serve TensorFlow models -- `torch-inference-pipeline` to load and serve PyTorch models -- `pandas-inference-pipeline` to load and serve XGBoost, Catboost and Sklearn models -- `vllm-inference-pipeline` to load and serve LLMs with vLLM inference engine -- `minimal-inference-pipeline` to install your own custom framework, contains a minimal set of dependencies - -## Next steps - -In this guide you learned how to find the bundled python environments and where they can be used. -Now you can test out the environment in a [Jupyter notebook](../jupyter/python_notebook.md). +# Python Environments + +## Introduction + +Hopsworks postulates that building ML systems following the FTI pipeline architecture is best practice. +This architecture consists of three independently developed and operated ML pipelines: + +- Feature Pipeline: takes as input raw data that it transforms into features (and labels) +- Training Pipeline: takes as input features (and labels) and outputs a trained model +- Inference Pipeline: takes new feature data and a trained model and makes predictions. + +In order to facilitate the development of these pipelines Hopsworks bundles several python environments containing necessary dependencies. +Each environment can also be customized further by installing additional dependencies from PyPi, Conda, Wheel files, GitHub repos or applying custom Dockerfiles on top. + +### Step 1: Go to environments page + +Under the `Project settings` section you can find the `Python environment` setting. + +### Step 2: List available environments + +Environments listed under `FEATURE ENGINEERING` corresponds to environments you would use in a feature pipeline, `MODEL TRAINING` maps to environments used in a training pipeline and `MODEL INFERENCE` are what you would use in inference pipelines. + +

+

+ Bundled python environments +
Bundled python environments
+
+

+ +!!! note "Python version" + The python version used in all the environments is 3.11. + +### Feature engineering + +The `FEATURE ENGINEERING` environments can be used in [Jupyter notebooks](../jupyter/python_notebook.md), a [Python job](../jobs/python_job.md) or a [PySpark job](../jobs/pyspark_job.md). + +- `python-feature-pipeline` for writing feature pipelines using Python +- `spark-feature-pipeline` for writing feature pipelines using PySpark + +### Model training + +The `MODEL TRAINING` environments can be used in [Jupyter notebooks](../jupyter/python_notebook.md) or a [Python job](../jobs/python_job.md) or in a [Ray job](../jobs/ray_job.md). + +- `tensorflow-training-pipeline` to train TensorFlow models +- `torch-training-pipeline` to train PyTorch models +- `pandas-training-pipeline` to train XGBoost, Catboost and Sklearn models +- `ray_training_pipeline` a general purpose environment for distributed training using Ray framework to train XGBoost and Sklearn models. + Should be used in [Ray job](../jobs/ray_job.md). + It can be customized to install additional dependencies of your choice. +- `ray_torch_training_pipeline` for distributed training of PyTorch models using Ray framework in a [Ray job](../jobs/ray_job.md) +- `ray_tensorflow_training_pipeline` for distributed training of TensorFlow models using Ray framework in a [Ray job](../jobs/ray_job.md) + +### Model inference + +The `MODEL INFERENCE` environments can be used in a deployment using a custom predictor script. 
+ +- `tensorflow-inference-pipeline` to load and serve TensorFlow models +- `torch-inference-pipeline` to load and serve PyTorch models +- `pandas-inference-pipeline` to load and serve XGBoost, Catboost and Sklearn models +- `vllm-inference-pipeline` to load and serve LLMs with vLLM inference engine +- `minimal-inference-pipeline` to install your own custom framework, contains a minimal set of dependencies + +## Next steps + +In this guide you learned how to find the bundled python environments and where they can be used. +Now you can test out the environment in a [Jupyter notebook](../jupyter/python_notebook.md). diff --git a/docs/user_guides/projects/scheduling/kube_scheduler.md b/docs/user_guides/projects/scheduling/kube_scheduler.md index df352ddfa8..fa1db20b9c 100644 --- a/docs/user_guides/projects/scheduling/kube_scheduler.md +++ b/docs/user_guides/projects/scheduling/kube_scheduler.md @@ -92,18 +92,18 @@ This can be done from the `Available in Hopsworks` sub-section. In order to be able to list all the Kubernetes Node Labels, Hopsworks requires the following cluster role: - ``` - - apiGroups: [""] - resources: ["nodes"] - verbs: ["get", "list"] + ```yaml + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list"] ``` In order to be able to list all the Kubernetes Cluster Priority Classes, Hopsworsk requires this cluster role: - ``` - - apiGroups: ["scheduling.k8s.io"] - resources: ["priorityclasses"] - verbs: ["get", "list"] + ```yaml + - apiGroups: ["scheduling.k8s.io"] + resources: ["priorityclasses"] + verbs: ["get", "list"] ``` If the roles above are configured properly (default behaviour), admins can only select values from the drop down menu. 
diff --git a/mkdocs.yml b/mkdocs.yml index 749cebddbc..2660cc3525 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -364,6 +364,10 @@ plugins: link_source: true extensions: - hopsworks_apigen.mkdocs + docstring_style: google + docstring_options: + ignore_init_summary: false + merge_init_into_class: false inventories: - https://docs.python.org/3/objects.inv - https://pandas.pydata.org/docs/objects.inv @@ -372,7 +376,12 @@ plugins: - https://docs.pydantic.dev/latest/objects.inv - https://fastapi.tiangolo.com/objects.inv - https://scikit-learn.org/stable/objects.inv + - https://arrow.apache.org/docs/objects.inv - https://docs.pola.rs/api/python/stable/objects.inv + - url: file:./docs/polars_patch.inv + base_url: https://docs.pola.rs/api/python/stable/ + - url: file:./docs/great_expectations.inv + base_url: https://docs.greatexpectations.io/docs/0.18/reference/api/ markdown_extensions: - admonition diff --git a/requirements-docs.txt b/requirements-docs.txt index 3e50579236..089dda472d 100644 --- a/requirements-docs.txt +++ b/requirements-docs.txt @@ -4,7 +4,7 @@ mike==2.1.3 markdown==3.9 pymdown-extensions==10.17.2 mkdocs-minify-plugin>=0.2.0 -hopsworks-apigen==1.0.0 +hopsworks-apigen==1.0.3 mkdocstrings[python]==1.0.3 mkdocstrings-python==2.0.2 mkdocs-autorefs==1.4.4