Add URL and content ingestion (Milestone 6)

- Add URL fetching with HTML-to-text extraction
- Add basic PDF text extraction
- Add smart content chunking with overlap
- Add deduplication via content checksums
- Add auto-linking to semantically related nodes
- Add CLI commands: ingest, clip
- Add MCP tools: memory_ingest, memory_clip
2026-02-03 11:00:28 +01:00
parent 67b1e3b481
commit c65a5bb03a
6 changed files with 655 additions and 0 deletions
--- a/src/mcp/index.ts
+++ b/src/mcp/index.ts
@@ -650,6 +650,56 @@ server.tool(
  }
 );

+// --- memory_ingest ---
+import { ingest } from '../core/ingest';
+
+server.tool( // Registers the 'memory_ingest' tool: raw text is stored via addNode(), URLs are handed to ingest()
+  'memory_ingest',
+  'Ingest content from a URL or text into the knowledge graph',
+  {
+    source: z.string().describe('URL or raw text to ingest'),
+    title: z.string().optional().describe('Override title'),
+    tags: z.array(z.string()).optional().describe('Tags to apply'),
+    isUrl: z.boolean().optional().describe('Treat source as URL (auto-detected if not specified)'),
+  },
+  async ({ source, title, tags, isUrl }) => {
+    // An explicit isUrl value wins; when it is omitted, a source starting with http:// or https:// counts as a URL
+    const isSourceUrl = isUrl ?? (source.startsWith('http://') || source.startsWith('https://'));
+
+    if (!isSourceUrl) {
+      // Raw text path: store the source verbatim as a single 'memory' node and return its id and title
+      const node = await addNode({
+        kind: 'memory',
+        title: title || 'Ingested Content', // default title when none (or an empty string) is supplied
+        content: source,
+        tags: ['ingested', 'text', ...(tags || [])], // fixed marker tags first, then any caller-supplied tags
+        metadata: { source: { type: 'text', ingestedAt: Date.now() } }, // ingestedAt is a millisecond epoch timestamp
+      });
+      return { content: [{ type: 'text' as const, text: serialize({ success: true, nodeId: node.id, title: node.title }) }] };
+    }
+
+    const result = await ingest(source, { title, tags }); // URL path; NOTE(review): an explicit isUrl: true reaches here even without an http(s) prefix — confirm ingest() validates
+    return { content: [{ type: 'text' as const, text: serialize(result) }] }; // ingest() result is returned serialized as a single text item
+  }
+);
+
+server.tool( // Registers the 'memory_clip' tool: a URL-only shortcut that forwards the URL to ingest()
+  'memory_clip',
+  'Quick clip a URL into memory',
+  {
+    url: z.string().describe('URL to clip'),
+    title: z.string().optional().describe('Override title'),
+    tags: z.array(z.string()).optional().describe('Tags to apply'),
+  },
+  async ({ url, title, tags }) => {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) { // prefix check only: anything not starting with http:// or https:// is rejected
+      return { content: [{ type: 'text' as const, text: serialize({ error: 'Invalid URL' }) }], isError: true }; // returned as an error result (isError: true) rather than thrown
+    }
+    const result = await ingest(url, { title, tags }); // same ingest() call the URL path of memory_ingest uses
+    return { content: [{ type: 'text' as const, text: serialize(result) }] }; // ingest() result is returned serialized as a single text item
+  }
+);
+
 // --- memory_index ---
 import { indexProject } from '../core/indexer';