-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Add PageIndexClient with agent-based retrieval via OpenAI Agents SDK #125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
KylinMountain
wants to merge
7
commits into
VectifyAI:main
Choose a base branch
from
KylinMountain:feat/retrieve
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
3fb4c74
Add PageIndexClient with storage persistence and streaming support
KylinMountain cc4c5fc
Rename go_deeper to explore in retrieve prompt for clarity
KylinMountain 94c8838
refactor to tools and agent
KylinMountain 75a5839
Remove PDF from repo; download at runtime; add verbose tool logging
KylinMountain 239e5e8
remove test mock client
KylinMountain 3cf7d21
resolve review comments
KylinMountain 58a61f6
Fix critical and important robustness issues in retrieve and client
KylinMountain File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1,16 @@ | ||
from .page_index import *
from .page_index_md import md_to_tree
from .retrieve import tool_get_document, tool_get_document_structure, tool_get_page_content

# PageIndexClient depends on the optional 'openai-agents' package. If that
# import fails we install a stub class that raises a helpful ImportError on
# construction, so `from pageindex import PageIndexClient` always succeeds
# and the failure surfaces only when the client is actually used.
try:
    from .client import PageIndexClient
except ImportError as _e:
    _import_error_msg = str(_e)

    class PageIndexClient:  # type: ignore[no-redef]
        def __init__(self, *args, **kwargs):
            raise ImportError(
                "PageIndexClient requires 'openai-agents'. "
                "Install it with: pip install openai-agents\n"
                f"(Original error: {_import_error_msg})"
            )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,214 @@ | ||
| import os | ||
| import uuid | ||
| import json | ||
| import asyncio | ||
| import concurrent.futures | ||
| from pathlib import Path | ||
| from agents import Agent, Runner, function_tool | ||
| from agents.stream_events import RunItemStreamEvent | ||
|
|
||
| from .page_index import page_index | ||
| from .page_index_md import md_to_tree | ||
| from .retrieve import tool_get_document, tool_get_document_structure, tool_get_page_content | ||
|
|
||
# System prompt for the document-QA agent. It enforces a fixed tool order
# (metadata -> structure -> targeted page fetch) and grounds answers strictly
# in tool output; 'pages' doubles as line numbers for Markdown documents.
AGENT_SYSTEM_PROMPT = """
You are PageIndex, a document QA assistant.
TOOL USE:
- Call get_document() first to confirm status and page/line count.
- Call get_document_structure() to find relevant page ranges (use node summaries and start_index/end_index).
- Call get_page_content(pages="5-7") with tight ranges. Never fetch the whole doc.
- For Markdown, pages = line numbers from the structure (the line_num field). Use line_count from get_document() as the upper bound.
ANSWERING: Answer based only on tool output. Be concise.
"""
|
|
||
|
|
||
class PageIndexClient:
    """
    A client for the PageIndex API.

    Uses an OpenAI Agents SDK agent with 3 tools to answer document questions.
    Flow: Index -> query_agent (tool-use loop) -> Answer
    """

    def __init__(self, api_key: str = None, model: str = "gpt-4o-2024-11-20", workspace: str = None):
        """
        Args:
            api_key: OpenAI API key; falls back to the CHATGPT_API_KEY env var.
            model: Model name used both for indexing and for the agent.
            workspace: Optional directory; indexed documents are persisted
                there as <doc_id>.json and reloaded on construction.
        """
        self.api_key = api_key or os.getenv("CHATGPT_API_KEY")
        if self.api_key:
            # page_index/md_to_tree read CHATGPT_API_KEY; the Agents SDK
            # reads OPENAI_API_KEY — export to both so either path works.
            os.environ["CHATGPT_API_KEY"] = self.api_key
            os.environ["OPENAI_API_KEY"] = self.api_key
        self.model = model
        self.workspace = Path(workspace).expanduser() if workspace else None
        if self.workspace:
            self.workspace.mkdir(parents=True, exist_ok=True)
        # doc_id -> {'id', 'path', 'type', 'structure', 'doc_name', 'doc_description'}
        self.documents = {}
        if self.workspace:
            self._load_workspace()

    # ── Event-loop helper ─────────────────────────────────────────────────────

    @staticmethod
    def _run_coro(coro):
        """
        Run *coro* to completion and return its result.

        Works from plain synchronous code (plain asyncio.run) and from inside
        an already-running event loop (e.g. Jupyter), where the coroutine is
        handed to a fresh loop on a worker thread.

        Note: the except clause covers ONLY asyncio.get_running_loop(); the
        previous inline version also caught RuntimeErrors raised by the
        threaded run itself, which could execute the coroutine a second time.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop — safe to run directly.
            return asyncio.run(coro)
        # Inside a running event loop — run in a dedicated thread.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()

    # ── Indexing ──────────────────────────────────────────────────────────────

    def index(self, file_path: str, mode: str = "auto") -> str:
        """
        Upload and index a document. Returns a document_id.

        Args:
            file_path: Path to a .pdf or .md/.markdown file.
            mode: 'pdf', 'md', or 'auto' (detect from the file extension).

        Raises:
            FileNotFoundError: If file_path does not exist.
            ValueError: If the format is unsupported for the given mode.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        doc_id = str(uuid.uuid4())
        ext = os.path.splitext(file_path)[1].lower()
        is_pdf = ext == '.pdf'
        is_md = ext in ['.md', '.markdown']

        if mode == "pdf" or (mode == "auto" and is_pdf):
            print(f"Indexing PDF: {file_path}")
            result = page_index(
                doc=file_path,
                model=self.model,
                if_add_node_summary='yes',
                if_add_node_text='yes',
                if_add_node_id='yes',
                if_add_doc_description='yes'
            )
            self._register(doc_id, file_path, 'pdf', result)
        elif mode == "md" or (mode == "auto" and is_md):
            print(f"Indexing Markdown: {file_path}")
            # md_to_tree is async; _run_coro keeps this usable from notebooks
            # (a bare asyncio.run here would raise inside a running loop).
            result = self._run_coro(md_to_tree(
                md_path=file_path,
                if_thinning=False,
                if_add_node_summary='yes',
                summary_token_threshold=200,
                model=self.model,
                if_add_doc_description='yes',
                if_add_node_text='yes',
                if_add_node_id='yes'
            ))
            self._register(doc_id, file_path, 'md', result)
        else:
            raise ValueError(f"Unsupported file format for: {file_path}")

        print(f"Indexing complete. Document ID: {doc_id}")
        if self.workspace:
            self._save_doc(doc_id)
        return doc_id

    def _register(self, doc_id: str, file_path: str, doc_type: str, result: dict):
        """Store an indexing result in the in-memory document registry."""
        self.documents[doc_id] = {
            'id': doc_id,
            'path': file_path,
            'type': doc_type,
            'structure': result['structure'],
            'doc_name': result.get('doc_name', ''),
            'doc_description': result.get('doc_description', '')
        }

    # ── Workspace persistence ─────────────────────────────────────────────────

    def _save_doc(self, doc_id: str):
        """Persist one document record to <workspace>/<doc_id>.json."""
        path = self.workspace / f"{doc_id}.json"
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.documents[doc_id], f, ensure_ascii=False, indent=2)

    def _load_workspace(self):
        """Reload previously indexed documents from the workspace directory."""
        loaded = 0
        for path in self.workspace.glob("*.json"):
            try:
                with open(path, "r", encoding="utf-8") as f:
                    doc = json.load(f)
                # The filename stem is the doc_id used at save time.
                self.documents[path.stem] = doc
                loaded += 1
            except (json.JSONDecodeError, OSError) as e:
                # Best effort: a corrupt file should not block startup.
                print(f"Warning: skipping corrupt workspace file {path.name}: {e}")
        if loaded:
            print(f"Loaded {loaded} document(s) from workspace.")

    # ── Public tool methods (thin wrappers) ───────────────────────────────────

    def get_document(self, doc_id: str) -> str:
        """Return document metadata JSON."""
        return tool_get_document(self.documents, doc_id)

    def get_document_structure(self, doc_id: str) -> str:
        """Return document tree structure JSON (without text fields)."""
        return tool_get_document_structure(self.documents, doc_id)

    def get_page_content(self, doc_id: str, pages: str) -> str:
        """Return page content JSON for the given pages string (e.g. '5-7', '3,8', '12')."""
        return tool_get_page_content(self.documents, doc_id, pages)

    # ── Agent core ────────────────────────────────────────────────────────────

    def query_agent(self, doc_id: str, prompt: str, verbose: bool = False) -> str:
        """
        Run the PageIndex agent for a document query.
        The agent automatically calls get_document, get_document_structure,
        and get_page_content tools as needed to answer the question.

        Args:
            verbose: If True, print each tool call and result as they happen.
        """
        client_self = self

        # The tools close over doc_id so the model never has to supply it.
        @function_tool
        def get_document() -> str:
            """Get document metadata: status, page count, name, and description."""
            return client_self.get_document(doc_id)

        @function_tool
        def get_document_structure() -> str:
            """Get the document's full tree structure (without text) to find relevant sections."""
            return client_self.get_document_structure(doc_id)

        @function_tool
        def get_page_content(pages: str) -> str:
            """
            Get the text content of specific pages or line numbers.
            Use tight ranges: e.g. '5-7' for pages 5 to 7, '3,8' for pages 3 and 8, '12' for page 12.
            For Markdown documents, use line numbers from the structure's line_num field.
            """
            return client_self.get_page_content(doc_id, pages)

        agent = Agent(
            name="PageIndex",
            instructions=AGENT_SYSTEM_PROMPT,
            tools=[get_document, get_document_structure, get_page_content],
            model=self.model,
        )

        if not verbose:
            # Runner.run_sync raises inside a running event loop (e.g. Jupyter);
            # routing through _run_coro makes this path as robust as verbose mode.
            result = self._run_coro(Runner.run(agent, prompt))
            return result.final_output

        # verbose mode: stream events and print tool calls
        async def _run_verbose():
            turn = 0
            stream = Runner.run_streamed(agent, prompt)
            async for event in stream.stream_events():
                if not isinstance(event, RunItemStreamEvent):
                    continue
                if event.name == "tool_called":
                    turn += 1
                    raw = event.item.raw_item
                    args = getattr(raw, "arguments", "{}")
                    print(f"\n[Turn {turn}] → {raw.name}({args})")
                elif event.name == "tool_output":
                    output = str(event.item.output)
                    preview = output[:200] + "..." if len(output) > 200 else output
                    print(f" ← {preview}")
            return stream.final_output

        return self._run_coro(_run_verbose())

    # ── Public query API ──────────────────────────────────────────────────────

    def query(self, doc_id: str, prompt: str) -> str:
        """Ask a question about an indexed document. Returns the agent's answer."""
        return self.query_agent(doc_id, prompt)

    def query_stream(self, doc_id: str, prompt: str):
        """
        Ask a question about an indexed document with streaming output.
        MVP: yields the full answer as a single chunk.
        """
        yield self.query_agent(doc_id, prompt)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,137 @@ | ||
| import json | ||
| import PyPDF2 | ||
|
|
||
| try: | ||
| from .utils import get_number_of_pages, remove_fields | ||
| except ImportError: | ||
| from utils import get_number_of_pages, remove_fields | ||
|
|
||
|
|
||
| # ── Helpers ────────────────────────────────────────────────────────────────── | ||
|
|
||
| def _parse_pages(pages: str) -> list[int]: | ||
| """Parse a pages string like '5-7', '3,8', or '12' into a sorted list of ints.""" | ||
| result = [] | ||
| for part in pages.split(','): | ||
| part = part.strip() | ||
| if '-' in part: | ||
| start, end = part.split('-', 1) | ||
| result.extend(range(int(start.strip()), int(end.strip()) + 1)) | ||
| else: | ||
| result.append(int(part)) | ||
KylinMountain marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return sorted(set(result)) | ||
|
|
||
|
|
||
| def _count_pages(doc_info: dict) -> int: | ||
| """Return total page count for a document.""" | ||
| if doc_info.get('type') == 'pdf': | ||
| return get_number_of_pages(doc_info['path']) | ||
| # For MD, find max line_num across all nodes | ||
| max_line = 0 | ||
| def _traverse(nodes): | ||
| nonlocal max_line | ||
| for node in nodes: | ||
| ln = node.get('line_num', 0) | ||
| if ln and ln > max_line: | ||
| max_line = ln | ||
| if node.get('nodes'): | ||
| _traverse(node['nodes']) | ||
| _traverse(doc_info.get('structure', [])) | ||
| return max_line | ||
|
|
||
|
|
||
def _get_pdf_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]:
    """Extract text for specific PDF pages (1-indexed), opening the PDF once."""
    with open(doc_info['path'], 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        last_page = len(reader.pages)
        extracted = []
        for num in page_nums:
            # Silently skip out-of-range page numbers.
            if 1 <= num <= last_page:
                text = reader.pages[num - 1].extract_text() or ''
                extracted.append({'page': num, 'content': text})
        return extracted
|
|
||
|
|
||
| def _get_md_page_content(doc_info: dict, page_nums: list[int]) -> list[dict]: | ||
| """ | ||
| For Markdown documents, 'pages' are line numbers. | ||
| Find nodes whose line_num falls within the requested set and return their text. | ||
| """ | ||
| page_set = set(page_nums) | ||
| results = [] | ||
| seen = set() | ||
|
|
||
| def _traverse(nodes): | ||
| for node in nodes: | ||
| ln = node.get('line_num') | ||
| if ln and ln in page_set and ln not in seen: | ||
| seen.add(ln) | ||
| results.append({'page': ln, 'content': node.get('text', '')}) | ||
| if node.get('nodes'): | ||
| _traverse(node['nodes']) | ||
|
|
||
| _traverse(doc_info.get('structure', [])) | ||
| results.sort(key=lambda x: x['page']) | ||
| return results | ||
|
|
||
|
|
||
| # ── Tool functions ──────────────────────────────────────────────────────────── | ||
|
|
||
def tool_get_document(documents: dict, doc_id: str) -> str:
    """
    Return JSON with document metadata: doc_id, doc_name, doc_description,
    type, status, page_count (PDF) or line_count (Markdown).
    """
    doc_info = documents.get(doc_id)
    if not doc_info:
        return json.dumps({'error': f'Document {doc_id} not found'})
    result = {
        'doc_id': doc_id,
        'doc_name': doc_info.get('doc_name', ''),
        'doc_description': doc_info.get('doc_description', ''),
        'type': doc_info.get('type', ''),
        # Indexing is synchronous, so any registered document is complete.
        'status': 'completed',
    }
    if doc_info.get('type') == 'pdf':
        result['page_count'] = _count_pages(doc_info)
    else:
        result['line_count'] = _count_pages(doc_info)
    # ensure_ascii=False for consistency with the other tools: doc names and
    # descriptions may contain non-ASCII text and should not be \u-escaped.
    return json.dumps(result, ensure_ascii=False)
|
|
||
|
|
||
def tool_get_document_structure(documents: dict, doc_id: str) -> str:
    """Return tree structure JSON with text fields removed (saves tokens)."""
    doc_info = documents.get(doc_id)
    if not doc_info:
        return json.dumps({'error': f'Document {doc_id} not found'})
    # Strip the bulky 'text' payloads; the agent only needs the skeleton
    # (titles, summaries, page/line indices) to locate relevant sections.
    pruned = remove_fields(doc_info.get('structure', []), fields=['text'])
    return json.dumps(pruned, ensure_ascii=False)
|
|
||
|
|
||
def tool_get_page_content(documents: dict, doc_id: str, pages: str) -> str:
    """
    Retrieve page content for a document.

    pages format: '5-7', '3,8', or '12'
    For PDF: pages are physical page numbers (1-indexed).
    For Markdown: pages are line numbers corresponding to node headers.

    Returns JSON list of {'page': int, 'content': str}.
    """
    doc_info = documents.get(doc_id)
    if not doc_info:
        return json.dumps({'error': f'Document {doc_id} not found'})

    try:
        requested = _parse_pages(pages)
    except (ValueError, AttributeError) as e:
        return json.dumps({'error': f'Invalid pages format: {pages!r}. Use "5-7", "3,8", or "12". Error: {e}'})

    # Dispatch on document type; Markdown is the default reader.
    reader = _get_pdf_page_content if doc_info.get('type') == 'pdf' else _get_md_page_content
    try:
        content = reader(doc_info, requested)
    except Exception as e:
        # Tool boundary: surface failures as JSON rather than raising into
        # the agent loop.
        return json.dumps({'error': f'Failed to read page content: {e}'})

    return json.dumps(content, ensure_ascii=False)
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.