Add URL and content ingestion (Milestone 6)

- Add URL fetching with HTML-to-text extraction
- Add basic PDF text extraction
- Add smart content chunking with overlap
- Add deduplication via content checksums
- Add auto-linking to semantically related nodes
- Add CLI commands: ingest, clip
- Add MCP tools: memory_ingest, memory_clip
This commit is contained in:
2026-02-03 11:00:28 +01:00
parent 67b1e3b481
commit c65a5bb03a
6 changed files with 655 additions and 0 deletions

View File

@@ -650,6 +650,56 @@ server.tool(
}
);
// --- memory_ingest ---
import { ingest } from '../core/ingest';
// Registers the `memory_ingest` MCP tool.
//
// Input:
//   source - a URL or raw text
//   title  - optional title override
//   tags   - optional extra tags
//   isUrl  - optional explicit override of URL auto-detection
//
// Behavior:
//   - URL sources are delegated to `ingest` (from '../core/ingest') and its
//     result is serialized back to the caller.
//   - Anything else is stored verbatim as a single `memory` node via `addNode`.
server.tool(
  'memory_ingest',
  'Ingest content from a URL or text into the knowledge graph',
  {
    source: z.string().describe('URL or raw text to ingest'),
    title: z.string().optional().describe('Override title'),
    tags: z.array(z.string()).optional().describe('Tags to apply'),
    isUrl: z.boolean().optional().describe('Treat source as URL (auto-detected if not specified)'),
  },
  async ({ source, title, tags, isUrl }) => {
    const trimmed = source.trim();

    // Auto-detection only fires when the WHOLE (trimmed) source is one
    // parseable http(s) URL. A plain prefix check would misroute raw text
    // that merely begins with a link, and would miss `HTTPS://...` or
    // whitespace-padded URLs.
    const looksLikeUrl = (): boolean => {
      if (!/^https?:\/\//i.test(trimmed) || /\s/.test(trimmed)) {
        return false;
      }
      try {
        // The URL constructor throws a TypeError on unparseable input.
        new URL(trimmed);
        return true;
      } catch {
        return false;
      }
    };

    // An explicit `isUrl` from the caller always wins over auto-detection.
    const isSourceUrl = isUrl ?? looksLikeUrl();

    if (!isSourceUrl) {
      // Raw text: store the original (untrimmed) source as one memory node.
      // The Set drops duplicates if the caller already passed 'ingested' or
      // 'text'; insertion order is preserved.
      const nodeTags = [...new Set(['ingested', 'text', ...(tags ?? [])])];
      const node = await addNode({
        kind: 'memory',
        // `||` is intentional: an empty-string title also falls back to the default.
        title: title || 'Ingested Content',
        content: source,
        tags: nodeTags,
        metadata: { source: { type: 'text', ingestedAt: Date.now() } },
      });
      return {
        content: [
          {
            type: 'text' as const,
            text: serialize({ success: true, nodeId: node.id, title: node.title }),
          },
        ],
      };
    }

    // URL path: surrounding whitespace is never part of a URL, so pass the
    // trimmed form on to `ingest`.
    const result = await ingest(trimmed, { title, tags });
    return { content: [{ type: 'text' as const, text: serialize(result) }] };
  }
);
// Registers the `memory_clip` MCP tool.
//
// Input:
//   url   - the URL to clip (must be http or https)
//   title - optional title override
//   tags  - optional tags
//
// Behavior:
//   - Validates the URL, then delegates to `ingest` (from '../core/ingest')
//     and returns its serialized result.
//   - Invalid input yields an `isError` result carrying `{ error: 'Invalid URL' }`.
server.tool(
  'memory_clip',
  'Quick clip a URL into memory',
  {
    url: z.string().describe('URL to clip'),
    title: z.string().optional().describe('Override title'),
    tags: z.array(z.string()).optional().describe('Tags to apply'),
  },
  async ({ url, title, tags }) => {
    // Tolerate surrounding whitespace (common when a URL is pasted).
    const target = url.trim();

    // A scheme prefix alone is not enough: strings such as "http://" or
    // "https://[" pass a prefix check but are not URLs. Require that the
    // value also parses. URL schemes are case-insensitive, hence the `i` flag.
    let isValid = /^https?:\/\//i.test(target);
    if (isValid) {
      try {
        // The URL constructor throws a TypeError on unparseable input.
        new URL(target);
      } catch {
        isValid = false;
      }
    }

    if (!isValid) {
      return {
        content: [{ type: 'text' as const, text: serialize({ error: 'Invalid URL' }) }],
        isError: true,
      };
    }

    const result = await ingest(target, { title, tags });
    return { content: [{ type: 'text' as const, text: serialize(result) }] };
  }
);
// --- memory_index ---
import { indexProject } from '../core/indexer';