From a50b6f26a632df94f113366266afb42899dc3fdd Mon Sep 17 00:00:00 2001
From: Alan Jowett <alan.jowett@microsoft.com>
Date: Sun, 29 Mar 2026 19:17:20 -0700
Subject: [PATCH 1/3] Add prompt graph integrity checks (issue #111 step 1)

Add tests/validate-graph-integrity.py that validates structural
integrity across all PromptKit components:

- Broken paths: manifest path fields point to actual files on disk
- Broken references: template persona/protocol/format/taxonomy refs
  resolve to entries in the manifest
- Orphaned files: component files on disk not listed in the manifest
- Missing companions: templates lacking required persona or protocols
- Pipeline integrity: pipeline stage templates exist in the manifest

Update CI workflow to run the new check alongside the existing
manifest protocol sync validator, and trigger on all component
directories (personas/, protocols/, formats/, taxonomies/).

Closes the 'Prompt Graph Integrity Checks' milestone in #111.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .github/workflows/validate-manifest.yml |  26 +-
 tests/validate-graph-integrity.py       | 346 ++++++++++++++++++++++++
 2 files changed, 371 insertions(+), 1 deletion(-)
 create mode 100644 tests/validate-graph-integrity.py

diff --git a/.github/workflows/validate-manifest.yml b/.github/workflows/validate-manifest.yml
index 12823ff..cf31ef5 100644
--- a/.github/workflows/validate-manifest.yml
+++ b/.github/workflows/validate-manifest.yml
@@ -1,19 +1,29 @@
 # SPDX-License-Identifier: MIT
 # Copyright (c) PromptKit Contributors
 
-name: Validate Manifest
+name: Validate Prompt Library
 
 on:
   push:
     paths:
       - 'manifest.yaml'
+      - 'personas/**'
+      - 'protocols/**'
+      - 'formats/**'
+      - 'taxonomies/**'
       - 'templates/**'
       - 'tests/validate-manifest.py'
+      - 'tests/validate-graph-integrity.py'
   pull_request:
     paths:
       - 'manifest.yaml'
+      - 'personas/**'
+      - 'protocols/**'
+      - 'formats/**'
+      - 'taxonomies/**'
       - 'templates/**'
       - 'tests/validate-manifest.py'
+      - 'tests/validate-graph-integrity.py'
 
 jobs:
   validate-manifest:
@@ -29,3 +39,17 @@ jobs:
 
       - name: Validate manifest protocols
         run: python tests/validate-manifest.py
+
+  validate-graph-integrity:
+    name: Check prompt graph integrity
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Validate graph integrity
+        run: python tests/validate-graph-integrity.py
diff --git a/tests/validate-graph-integrity.py b/tests/validate-graph-integrity.py
new file mode 100644
index 0000000..a2e4826
--- /dev/null
+++ b/tests/validate-graph-integrity.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: MIT
+# Copyright (c) PromptKit Contributors
+
+"""Validate prompt graph integrity across all PromptKit components.
+
+This script ensures that all prompt components reference each other
+correctly, that no components are orphaned, and that required
+companion components are present.
+
+Checks performed:
+  1. Broken paths      — manifest ``path`` fields point to actual files
+  2. Broken references — template persona/protocol/format/taxonomy refs
+                         resolve to entries in the manifest
+  3. Orphaned files    — component files on disk not listed in the manifest
+  4. Missing companions — templates lacking required persona/protocol/format
+  5. Pipeline integrity — pipeline stage templates exist in the manifest
+
+Exit code 0 = all checks pass.
+Exit code 1 = one or more issues detected.
+"""
+
+from __future__ import annotations
+
+import re
+import sys
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Lightweight YAML helpers (avoids external dependencies)
+# ---------------------------------------------------------------------------
+
+
+def _parse_inline_list(text: str) -> list[str]:
+    """Parse an inline YAML list: ``[a, b, c]`` → ``['a', 'b', 'c']``."""
+    match = re.search(r"\[(.+?)]", text)
+    if match:
+        return [item.strip().strip("'\"") for item in match.group(1).split(",")]
+    return []
+
+
+def _split_sections(text: str) -> dict[str, str]:
+    """Split manifest text into top-level sections by unindented keys.
+
+    Returns ``{section_name: text_of_all_lines_under_that_key}``.
+    """
+    sections: dict[str, str] = {}
+    current_key: str | None = None
+    current_lines: list[str] = []
+
+    for line in text.splitlines():
+        if line and not line[0].isspace() and ":" in line and not line.startswith("#"):
+            if current_key is not None:
+                sections[current_key] = "\n".join(current_lines)
+            current_key = line.split(":")[0].strip()
+            current_lines = []
+        else:
+            current_lines.append(line)
+
+    if current_key is not None:
+        sections[current_key] = "\n".join(current_lines)
+
+    return sections
+
+
+def _parse_entries(
+    text: str,
+    extra_fields: tuple[str, ...] = (),
+) -> list[dict[str, object]]:
+    """Parse ``- name:`` entries and extract ``path`` plus *extra_fields*.
+
+    Works regardless of YAML nesting depth — it finds each ``- name:``
+    line and scans forward for sibling fields (exactly two spaces deeper)
+    until the indent drops back.
+    """
+    entries: list[dict[str, object]] = []
+    lines = text.splitlines()
+    i = 0
+
+    while i < len(lines):
+        stripped = lines[i].strip()
+
+        if stripped.startswith("- name:"):
+            entry_indent = len(lines[i]) - len(lines[i].lstrip())
+            entry: dict[str, object] = {
+                "name": stripped.split(":", 1)[1].strip().strip("'\""),
+                "path": "",
+            }
+            # Initialise list-valued fields so callers can iterate safely
+            for f in extra_fields:
+                if f not in entry:
+                    entry[f] = ""
+
+            j = i + 1
+            while j < len(lines):
+                fline = lines[j]
+                fstripped = fline.strip()
+
+                if not fstripped or fstripped.startswith("#"):
+                    j += 1
+                    continue
+
+                findent = len(fline) - len(fline.lstrip())
+                if findent <= entry_indent:
+                    break  # next entry or section boundary
+
+                # Only match fields at the expected sibling indent
+                if findent == entry_indent + 2:
+                    for field in ("path", *extra_fields):
+                        if fstripped.startswith(f"{field}:"):
+                            val = fstripped.split(":", 1)[1].strip()
+                            if val.startswith("["):
+                                entry[field] = _parse_inline_list(fstripped)
+                            elif val in (">", "|", ""):
+                                pass  # block scalar / empty — keep default
+                            else:
+                                entry[field] = val.strip("'\"")
+                            break
+
+                j += 1
+
+            entries.append(entry)
+            i = j
+        else:
+            i += 1
+
+    return entries
+
+
+def _parse_pipelines(text: str) -> dict[str, list[str]]:
+    """Parse the ``pipelines`` section.
+
+    Returns ``{pipeline_name: [template_name, ...]}``.
+    """
+    pipelines: dict[str, list[str]] = {}
+    current: str | None = None
+
+    for line in text.splitlines():
+        stripped = line.strip()
+        if not stripped or stripped.startswith("#"):
+            continue
+
+        indent = len(line) - len(line.lstrip())
+
+        # Pipeline name at indent 2
+        if indent == 2 and stripped.endswith(":") and not stripped.startswith("-"):
+            current = stripped[:-1]
+            pipelines[current] = []
+            continue
+
+        if current and stripped.startswith("- template:"):
+            tmpl = stripped.split(":", 1)[1].strip().strip("'\"")
+            pipelines[current].append(tmpl)
+
+    return pipelines
+
+
+# ---------------------------------------------------------------------------
+# File discovery
+# ---------------------------------------------------------------------------
+
+
+def _find_component_files(repo_root: Path) -> set[str]:
+    """Find all ``.md`` files in component directories.
+
+    Returns a set of repo-relative POSIX paths (e.g.
+    ``protocols/guardrails/anti-hallucination.md``).
+    """
+    files: set[str] = set()
+
+    for directory in ("personas", "formats", "taxonomies", "templates"):
+        dir_path = repo_root / directory
+        if dir_path.is_dir():
+            for f in dir_path.glob("*.md"):
+                files.add(f"{directory}/{f.name}")
+
+    # Protocols are nested by category
+    protocols_dir = repo_root / "protocols"
+    if protocols_dir.is_dir():
+        for f in protocols_dir.rglob("*.md"):
+            files.add(f.relative_to(repo_root).as_posix())
+
+    return files
+
+
+# ---------------------------------------------------------------------------
+# Validation checks
+# ---------------------------------------------------------------------------
+
+
+def validate(repo_root: Path) -> list[str]:
+    """Run all graph integrity checks.
+
+    Returns a list of tagged error strings (empty = all checks pass).
+    """
+    manifest_path = repo_root / "manifest.yaml"
+    errors: list[str] = []
+
+    if not manifest_path.exists():
+        return ["[broken-path] manifest.yaml not found"]
+
+    text = manifest_path.read_text(encoding="utf-8")
+    sections = _split_sections(text)
+
+    # Parse each component section
+    personas = _parse_entries(sections.get("personas", ""))
+    protocols = _parse_entries(sections.get("protocols", ""))
+    formats = _parse_entries(sections.get("formats", ""))
+    taxonomies = _parse_entries(sections.get("taxonomies", ""))
+    templates = _parse_entries(
+        sections.get("templates", ""),
+        extra_fields=("persona", "protocols", "format", "taxonomies"),
+    )
+    pipelines = _parse_pipelines(sections.get("pipelines", ""))
+
+    # Build lookup sets
+    persona_names = {p["name"] for p in personas}
+    protocol_names = {p["name"] for p in protocols}
+    format_names = {f["name"] for f in formats}
+    taxonomy_names = {t["name"] for t in taxonomies}
+    template_names = {t["name"] for t in templates}
+
+    all_components = personas + protocols + formats + taxonomies + templates
+    manifest_paths = {str(c["path"]) for c in all_components if c.get("path")}
+
+    # ------------------------------------------------------------------
+    # Check 1: Broken paths — manifest path → file on disk
+    # ------------------------------------------------------------------
+    for component in all_components:
+        path = component.get("path", "")
+        if path and not (repo_root / str(path)).exists():
+            errors.append(
+                f"[broken-path] {component['name']}: "
+                f"path '{path}' does not exist"
+            )
+
+    # ------------------------------------------------------------------
+    # Check 2: Broken references — template refs → manifest entries
+    # ------------------------------------------------------------------
+    for tmpl in templates:
+        name = tmpl["name"]
+
+        persona = tmpl.get("persona", "")
+        if persona and persona != "configurable" and persona not in persona_names:
+            errors.append(
+                f"[broken-ref] template '{name}': "
+                f"persona '{persona}' not found in manifest"
+            )
+
+        protos = tmpl.get("protocols", [])
+        if isinstance(protos, list):
+            for proto in protos:
+                if proto not in protocol_names:
+                    errors.append(
+                        f"[broken-ref] template '{name}': "
+                        f"protocol '{proto}' not found in manifest"
+                    )
+
+        fmt = tmpl.get("format", "")
+        if fmt and fmt not in format_names:
+            errors.append(
+                f"[broken-ref] template '{name}': "
+                f"format '{fmt}' not found in manifest"
+            )
+
+        taxes = tmpl.get("taxonomies", [])
+        if isinstance(taxes, list):
+            for tax in taxes:
+                if tax not in taxonomy_names:
+                    errors.append(
+                        f"[broken-ref] template '{name}': "
+                        f"taxonomy '{tax}' not found in manifest"
+                    )
+
+    # ------------------------------------------------------------------
+    # Check 3: Orphaned components — files not in manifest
+    # ------------------------------------------------------------------
+    actual_files = _find_component_files(repo_root)
+    for orphan in sorted(actual_files - manifest_paths):
+        errors.append(f"[orphan] {orphan}: exists on disk but not in manifest")
+
+    # ------------------------------------------------------------------
+    # Check 4: Missing companion components
+    #
+    # Persona and protocols are always required.  Format may be null
+    # for templates that define their output structure inline (e.g.
+    # generate-commit-message), so its absence is not an error.
+    # ------------------------------------------------------------------
+    for tmpl in templates:
+        name = tmpl["name"]
+        if not tmpl.get("persona"):
+            errors.append(f"[missing-companion] template '{name}': no persona")
+        if not tmpl.get("protocols"):
+            errors.append(f"[missing-companion] template '{name}': no protocols")
+
+    # ------------------------------------------------------------------
+    # Check 5: Pipeline integrity — stage templates exist
+    # ------------------------------------------------------------------
+    for pipeline_name, stage_templates in pipelines.items():
+        for tmpl in stage_templates:
+            if tmpl not in template_names:
+                errors.append(
+                    f"[broken-pipeline] pipeline '{pipeline_name}': "
+                    f"stage template '{tmpl}' not found in manifest"
+                )
+
+    return errors
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+
+def main() -> int:
+    """Run checks and print a structured report."""
+    if len(sys.argv) > 1:
+        repo_root = Path(sys.argv[1])
+    else:
+        repo_root = Path(__file__).resolve().parent.parent
+
+    errors = validate(repo_root)
+
+    if errors:
+        # Group errors by check type for readability
+        by_type: dict[str, list[str]] = {}
+        for err in errors:
+            m = re.match(r"\[([^\]]+)]", err)
+            tag = m.group(1) if m else "other"
+            by_type.setdefault(tag, []).append(err)
+
+        print(f"FAIL: graph integrity check found {len(errors)} issue(s):\n")
+        for tag, errs in by_type.items():
+            print(f"  [{tag}] ({len(errs)}):")
+            for err in errs:
+                msg = re.sub(r"^\[[^\]]+]\s*", "", err)
+                print(f"    - {msg}")
+            print()
+        return 1
+
+    print("OK: all graph integrity checks passed.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 4828e5559749edc8f8236ef409c36de2a14b3c9b Mon Sep 17 00:00:00 2001
From: Alan Jowett <alan.jowett@microsoft.com>
Date: Sun, 29 Mar 2026 19:28:06 -0700
Subject: [PATCH 2/3] Address review: fix docstring and init list fields with
 []

- Update docstring to reflect that format is intentionally optional
- Initialize list-valued extra fields (protocols, taxonomies) with []
  instead of empty string for type consistency

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/validate-graph-integrity.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/validate-graph-integrity.py b/tests/validate-graph-integrity.py
index a2e4826..2c230cf 100644
--- a/tests/validate-graph-integrity.py
+++ b/tests/validate-graph-integrity.py
@@ -13,7 +13,7 @@
   2. Broken references — template persona/protocol/format/taxonomy refs
                          resolve to entries in the manifest
   3. Orphaned files    — component files on disk not listed in the manifest
-  4. Missing companions — templates lacking required persona/protocol/format
+  4. Missing companions — templates lacking required persona and/or protocols
   5. Pipeline integrity — pipeline stage templates exist in the manifest
 
 Exit code 0 = all checks pass.
@@ -86,10 +86,10 @@ def _parse_entries(
                 "name": stripped.split(":", 1)[1].strip().strip("'\""),
                 "path": "",
             }
-            # Initialise list-valued fields so callers can iterate safely
+            # Initialise extra fields with appropriate default types
             for f in extra_fields:
                 if f not in entry:
-                    entry[f] = ""
+                    entry[f] = [] if f in ("protocols", "taxonomies") else ""
 
             j = i + 1
             while j < len(lines):

From 1c9ea91f6f10492d36febb59e508b329a2273ab5 Mon Sep 17 00:00:00 2001
From: Alan Jowett <alan.jowett@microsoft.com>
Date: Sun, 29 Mar 2026 19:40:22 -0700
Subject: [PATCH 3/3] Address review: frontmatter validation + strict path
 check

- Add Check 6: parse template file frontmatter and validate persona,
  protocol, format, and taxonomy references against manifest entries.
  Protocol paths are normalized to short names. Handles configurable
  personas and template variables.
- Use .is_file() instead of .exists() for broken-path detection to
  reject directories that accidentally match a manifest path.
- Frontmatter parser only matches top-level fields (indent 0) to avoid
  false matches inside nested blocks like params.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/validate-graph-integrity.py | 132 +++++++++++++++++++++++++++++-
 1 file changed, 131 insertions(+), 1 deletion(-)

diff --git a/tests/validate-graph-integrity.py b/tests/validate-graph-integrity.py
index 2c230cf..2c27577 100644
--- a/tests/validate-graph-integrity.py
+++ b/tests/validate-graph-integrity.py
@@ -15,6 +15,8 @@
   3. Orphaned files    — component files on disk not listed in the manifest
   4. Missing companions — templates lacking required persona and/or protocols
   5. Pipeline integrity — pipeline stage templates exist in the manifest
+  6. Frontmatter refs  — template file frontmatter references resolve to
+                         manifest entries (cross-checks the actual files)
 
 Exit code 0 = all checks pass.
 Exit code 1 = one or more issues detected.
@@ -39,6 +41,81 @@ def _parse_inline_list(text: str) -> list[str]:
     return []
 
 
+def _protocol_short_name(full_path: str) -> str:
+    """Extract the short protocol name from a category/name path.
+
+    E.g. ``'guardrails/anti-hallucination'`` → ``'anti-hallucination'``
+    """
+    return full_path.rsplit("/", 1)[-1]
+
+
+def _parse_template_frontmatter(text: str) -> dict[str, object] | None:
+    """Extract key fields from a template file's YAML frontmatter.
+
+    Returns a dict with ``persona``, ``protocols``, ``format``, and
+    ``taxonomies``, or *None* if no frontmatter block is found.
+    """
+    match = re.search(r"^---\s*\n(.*?)\n---", text, re.DOTALL | re.MULTILINE)
+    if not match:
+        return None
+    block = match.group(1)
+
+    result: dict[str, object] = {
+        "persona": "",
+        "protocols": [],
+        "format": "",
+        "taxonomies": [],
+    }
+    current_list_field: str | None = None
+
+    for line in block.splitlines():
+        stripped = line.strip()
+        indent = len(line) - len(line.lstrip())
+
+        # Only match top-level fields (no indentation) to avoid
+        # picking up identically-named keys inside nested blocks
+        # like params, input_contract, or output_contract.
+        if indent > 0:
+            # Still collect multi-line list items at indent 2
+            if current_list_field and stripped.startswith("- "):
+                result[current_list_field].append(
+                    stripped[2:].strip().strip("'\"")
+                )
+            elif stripped and current_list_field and not stripped.startswith("#"):
+                current_list_field = None
+            continue
+
+        # Scalar fields
+        for field in ("persona", "format"):
+            if stripped.startswith(f"{field}:"):
+                val = stripped.split(":", 1)[1].strip().strip("'\"")
+                if val == "null":
+                    val = ""
+                result[field] = val
+                current_list_field = None
+                break
+        else:
+            # List fields (inline or multi-line)
+            for list_field in ("protocols", "taxonomies"):
+                if stripped.startswith(f"{list_field}:"):
+                    inline = re.search(r"\[(.+)]", stripped)
+                    if inline:
+                        result[list_field] = [
+                            item.strip().strip("'\"")
+                            for item in inline.group(1).split(",")
+                        ]
+                        current_list_field = None
+                    else:
+                        current_list_field = list_field
+                    break
+            else:
+                # Any other top-level key ends a multi-line list
+                if current_list_field:
+                    current_list_field = None
+
+    return result
+
+
 def _split_sections(text: str) -> dict[str, str]:
     """Split manifest text into top-level sections by unindented keys.
 
@@ -228,7 +305,7 @@ def validate(repo_root: Path) -> list[str]:
     # ------------------------------------------------------------------
     for component in all_components:
         path = component.get("path", "")
-        if path and not (repo_root / str(path)).exists():
+        if path and not (repo_root / str(path)).is_file():
             errors.append(
                 f"[broken-path] {component['name']}: "
                 f"path '{path}' does not exist"
@@ -304,6 +381,59 @@ def validate(repo_root: Path) -> list[str]:
                     f"stage template '{tmpl}' not found in manifest"
                 )
 
+    # ------------------------------------------------------------------
+    # Check 6: Template frontmatter references → manifest entries
+    #
+    # Parse each template file's YAML frontmatter and validate that
+    # persona, protocol, format, and taxonomy references resolve to
+    # entries in the manifest.  Protocol paths are normalized to short
+    # names (e.g. 'guardrails/anti-hallucination' → 'anti-hallucination').
+    # ------------------------------------------------------------------
+    templates_dir = repo_root / "templates"
+    if templates_dir.is_dir():
+        for tmpl_file in sorted(templates_dir.glob("*.md")):
+            tmpl_text = tmpl_file.read_text(encoding="utf-8")
+            fm = _parse_template_frontmatter(tmpl_text)
+            if fm is None:
+                continue
+
+            fname = tmpl_file.stem
+
+            persona = fm.get("persona", "")
+            # Allow configurable/template-variable personas
+            if (
+                persona
+                and persona != "configurable"
+                and "{{" not in str(persona)
+                and persona not in persona_names
+            ):
+                errors.append(
+                    f"[broken-ref-frontmatter] {fname}: "
+                    f"persona '{persona}' not in manifest"
+                )
+
+            for proto in fm.get("protocols", []):
+                short = _protocol_short_name(str(proto))
+                if short not in protocol_names:
+                    errors.append(
+                        f"[broken-ref-frontmatter] {fname}: "
+                        f"protocol '{proto}' not in manifest"
+                    )
+
+            fmt = fm.get("format", "")
+            if fmt and "{{" not in str(fmt) and fmt not in format_names:
+                errors.append(
+                    f"[broken-ref-frontmatter] {fname}: "
+                    f"format '{fmt}' not in manifest"
+                )
+
+            for tax in fm.get("taxonomies", []):
+                if tax not in taxonomy_names:
+                    errors.append(
+                        f"[broken-ref-frontmatter] {fname}: "
+                        f"taxonomy '{tax}' not in manifest"
+                    )
+
     return errors