Add URL and content ingestion (Milestone 6)

- Add URL fetching with HTML-to-text extraction
- Add basic PDF text extraction
- Add smart content chunking with overlap
- Add deduplication via content checksums
- Add auto-linking to semantically related nodes
- Add CLI commands: ingest, clip
- Add MCP tools: memory_ingest, memory_clip
This commit is contained in:
2026-02-03 11:00:28 +01:00
parent 67b1e3b481
commit c65a5bb03a
6 changed files with 655 additions and 0 deletions

View File

@@ -0,0 +1,95 @@
import { Command } from 'commander';
import chalk from 'chalk';
import { ingest } from '../../core/ingest';
/**
 * CLI `ingest` command: pull content from a URL, file path, or stdin into
 * the knowledge graph, with optional chunk-size and auto-link controls.
 */
export const ingestCommand = new Command('ingest')
  .description('Ingest content from URLs, files, or stdin into the knowledge graph')
  .argument('[source]', 'URL or file path to ingest')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .option('--stdin', 'Read content from stdin')
  .option('--chunk-size <n>', 'Max tokens per chunk (default: 1000)')
  .option('--no-link', 'Skip auto-linking to related nodes')
  .action(async (source: string | undefined, opts) => {
    try {
      if (!source && !opts.stdin) {
        console.error(chalk.red('Error: Provide a source URL/file or use --stdin'));
        process.exit(1);
      }
      // Validate --chunk-size before fetching anything: bare parseInt would
      // silently turn a typo into NaN-sized chunks.
      let maxTokens: number | undefined;
      if (opts.chunkSize !== undefined) {
        maxTokens = parseInt(opts.chunkSize, 10);
        if (!Number.isInteger(maxTokens) || maxTokens <= 0) {
          console.error(chalk.red('Error: --chunk-size must be a positive integer'));
          process.exit(1);
        }
      }
      if (opts.stdin) {
        console.log(chalk.cyan('Reading from stdin... (Ctrl+D to end)'));
      } else {
        console.log(chalk.cyan(`Ingesting: ${source}`));
      }
      const result = await ingest(source || '', {
        title: opts.title,
        tags: opts.tags?.split(',').map((t: string) => t.trim()),
        stdin: opts.stdin,
        noLink: !opts.link, // commander's --no-link flag sets opts.link = false
        chunkStrategy: maxTokens !== undefined ? { maxTokens } : undefined,
      });
      if (!result.success) {
        // ingest() reports success:false only for duplicate-checksum content.
        console.log(chalk.yellow('Content already exists (duplicate checksum)'));
        return;
      }
      console.log();
      console.log(chalk.green(`✓ Ingested: ${result.title}`));
      console.log();
      console.log(`  Type: ${result.sourceType}`);
      console.log(`  Nodes: ${result.nodeCount}`);
      if (result.parentId) {
        console.log(`  Parent: ${result.parentId.slice(0, 8)}`);
      }
      // Show at most five created nodes, then a summary line.
      for (const node of result.nodes.slice(0, 5)) {
        console.log(chalk.dim(`  - ${node.id.slice(0, 8)} ${node.title}`));
      }
      if (result.nodes.length > 5) {
        console.log(chalk.dim(`  ... and ${result.nodes.length - 5} more`));
      }
    } catch (err: unknown) {
      // Narrow unknown before touching .message instead of assuming `any`.
      const message = err instanceof Error ? err.message : String(err);
      console.error(chalk.red(`Error: ${message}`));
      process.exit(1);
    }
  });
/**
 * CLI `clip` command: shorthand for ingesting a single web URL.
 */
export const clipCommand = new Command('clip')
  .description('Quick clip a URL (alias for ingest)')
  .argument('<url>', 'URL to clip')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .action(async (url: string, opts) => {
    try {
      // clip only accepts web URLs; anything else should go through `ingest`.
      const isHttpUrl = url.startsWith('http://') || url.startsWith('https://');
      if (!isHttpUrl) {
        console.error(chalk.red('Error: clip expects a URL'));
        process.exit(1);
      }
      console.log(chalk.cyan(`Clipping: ${url}`));
      const tagList = opts.tags?.split(',').map((tag: string) => tag.trim());
      const result = await ingest(url, { title: opts.title, tags: tagList });
      if (result.success) {
        console.log(chalk.green(`${result.title}`));
        console.log(chalk.dim(`  ${result.nodes[0].id.slice(0, 8)}`));
      } else {
        console.log(chalk.yellow('Already clipped (duplicate)'));
      }
    } catch (err: any) {
      console.error(chalk.red(`Error: ${err.message}`));
      process.exit(1);
    }
  });

View File

@@ -18,6 +18,7 @@ import { captureCommand, captureHookCommand, configCommand } from './commands/ca
import { contextCommand, contextHookCommand } from './commands/context';
import { indexCommand } from './commands/index-cmd';
import { journalCommand, journalAliasCommand, quickCaptureCommand } from './commands/journal';
import { ingestCommand, clipCommand } from './commands/ingest';
import { closeDb } from '../core/db';
const program = new Command();
@@ -50,6 +51,8 @@ program.addCommand(indexCommand);
program.addCommand(journalCommand);
program.addCommand(journalAliasCommand);
program.addCommand(quickCaptureCommand);
program.addCommand(ingestCommand);
program.addCommand(clipCommand);
program.hook('postAction', () => {
closeDb();

View File

@@ -0,0 +1,93 @@
/** Tuning knobs for {@link chunkContent}. */
export interface ChunkOptions {
  /** Soft cap on tokens per chunk (default 1000). */
  maxTokens?: number;
  /** Tokens of trailing context carried into the next chunk (default 100; 0 disables). */
  overlap?: number;
  /** Boundary strategy used when splitting (default 'paragraph'). */
  splitOn?: 'paragraph' | 'sentence' | 'heading';
}

/** One piece of chunked content. */
export interface Chunk {
  /** Zero-based position of the chunk in source order. */
  index: number;
  content: string;
  /** Rough token count (length / CHARS_PER_TOKEN, rounded up). */
  tokenEstimate: number;
}

const DEFAULT_MAX_TOKENS = 1000;
const DEFAULT_OVERLAP = 100;
const CHARS_PER_TOKEN = 4; // Rough estimate

/**
 * Split `content` into chunks of at most ~maxTokens, preferring natural
 * boundaries (paragraphs/sentences/headings) and optionally carrying
 * trailing overlap between consecutive chunks for context continuity.
 */
export function chunkContent(content: string, options: ChunkOptions = {}): Chunk[] {
  // `??` (not `||`) so an explicit `overlap: 0` actually disables overlap
  // instead of silently falling back to the default of 100.
  const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS;
  const overlap = options.overlap ?? DEFAULT_OVERLAP;
  const splitOn = options.splitOn ?? 'paragraph';
  const maxChars = maxTokens * CHARS_PER_TOKEN;
  const overlapChars = overlap * CHARS_PER_TOKEN;

  // If content is small enough, return it as a single chunk.
  if (content.length <= maxChars) {
    return [{
      index: 0,
      content,
      tokenEstimate: Math.ceil(content.length / CHARS_PER_TOKEN),
    }];
  }

  // Split into segments based on strategy, then pack segments into chunks.
  const segments = splitContent(content, splitOn);
  const chunks: Chunk[] = [];
  let currentChunk = '';

  // Push the buffered chunk (trimmed); whitespace-only buffers are skipped.
  const flush = () => {
    const trimmed = currentChunk.trim();
    if (trimmed) {
      chunks.push({
        index: chunks.length,
        content: trimmed,
        tokenEstimate: Math.ceil(currentChunk.length / CHARS_PER_TOKEN),
      });
    }
  };

  for (const segment of segments) {
    // A single segment larger than the whole budget (e.g. one giant
    // paragraph) must be hard-split — previously it was appended whole and
    // produced one oversized chunk.
    if (segment.length > maxChars) {
      if (currentChunk.length > 0) {
        flush();
        currentChunk = '';
      }
      let start = 0;
      while (start < segment.length) {
        currentChunk = segment.slice(start, start + maxChars);
        start += maxChars;
        // Keep the final slice buffered so following segments can join it.
        if (start < segment.length) flush();
      }
      continue;
    }
    if (currentChunk.length + segment.length > maxChars && currentChunk.length > 0) {
      flush();
      // Seed the next chunk with trailing overlap from the previous one.
      currentChunk = overlapChars > 0 && currentChunk.length > overlapChars
        ? currentChunk.slice(-overlapChars) + segment
        : segment;
    } else {
      currentChunk += segment;
    }
  }
  flush(); // Don't forget the last chunk
  return chunks;
}

/** Split content into raw segments according to the chosen boundary strategy. */
function splitContent(content: string, strategy: 'paragraph' | 'sentence' | 'heading'): string[] {
  switch (strategy) {
    case 'heading':
      // Split on markdown headings
      return content.split(/(?=^#{1,6}\s)/m).filter(s => s.trim());
    case 'sentence':
      // Split on sentence boundaries
      return content.split(/(?<=[.!?])\s+/).filter(s => s.trim());
    case 'paragraph':
    default:
      // Split on double newlines, re-appending the separator so joins stay readable.
      return content.split(/\n\n+/).filter(s => s.trim()).map(s => s + '\n\n');
  }
}

/** Rough token estimate: ~4 characters per token. */
export function estimateTokens(text: string): number {
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}

252
src/core/ingest/fetchers.ts Normal file
View File

@@ -0,0 +1,252 @@
import * as fs from 'fs';
import * as path from 'path';
import * as https from 'https';
import * as http from 'http';
/** Supported input kinds: remote URLs, or local pdf/markdown/html/plain-text files. */
export type SourceType = 'url' | 'pdf' | 'markdown' | 'text' | 'html';

/** Normalized result of fetching any supported source. */
export interface FetchedContent {
  /** Human-readable title (HTML <title>, file basename, or hostname fallback). */
  title: string;
  /** Extracted plain-text / markdown-ish content. */
  content: string;
  sourceType: SourceType;
  /** Source-specific details (url or filePath, fetchedAt timestamp, size, …). */
  metadata: Record<string, any>;
}

/**
 * Classify a source string: anything starting with http(s):// is a URL,
 * otherwise the file extension decides (unknown extensions fall back to text).
 */
export function detectSourceType(source: string): SourceType {
  if (/^https?:\/\//.test(source)) {
    return 'url';
  }
  const byExtension: Record<string, SourceType> = {
    '.pdf': 'pdf',
    '.md': 'markdown',
    '.html': 'html',
    '.htm': 'html',
  };
  return byExtension[path.extname(source).toLowerCase()] ?? 'text';
}
/**
 * Dispatch to the fetcher matching the detected source type.
 * @throws Error when no fetcher is registered for the type.
 */
export async function fetchContent(source: string, sourceType: SourceType): Promise<FetchedContent> {
  if (sourceType === 'url') {
    return fetchUrl(source);
  }
  if (sourceType === 'pdf') {
    return fetchPdf(source);
  }
  if (sourceType === 'markdown' || sourceType === 'text' || sourceType === 'html') {
    return fetchFile(source, sourceType);
  }
  throw new Error(`Unsupported source type: ${sourceType}`);
}
/**
 * Fetch a URL and convert its HTML into readable text.
 * Title comes from the <title> tag, falling back to the hostname.
 */
async function fetchUrl(url: string): Promise<FetchedContent> {
  const html = await httpGet(url);
  const titleTag = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
  const pageTitle = titleTag?.[1]?.trim() || new URL(url).hostname;
  return {
    title: pageTitle,
    content: htmlToText(html),
    sourceType: 'url',
    metadata: { url, fetchedAt: Date.now() },
  };
}
/**
 * Minimal HTTP(S) GET resolving with the response body as UTF-8 text.
 *
 * @param url           Absolute http(s) URL.
 * @param redirectsLeft Remaining redirect hops; bounds the recursion so a
 *                      redirect loop fails fast instead of recursing forever.
 * @throws on HTTP >= 400, 30s timeout, network errors, or too many redirects.
 */
function httpGet(url: string, redirectsLeft: number = 5): Promise<string> {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https') ? https : http;
    const req = client.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; Cortex/1.0)',
        'Accept': 'text/html,application/xhtml+xml,text/plain,*/*',
      },
      timeout: 30000,
    }, (res) => {
      // Handle redirects (3xx with a Location header), bounded by redirectsLeft.
      if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        res.resume(); // drain the body so the socket is released
        if (redirectsLeft <= 0) {
          reject(new Error('Too many redirects'));
          return;
        }
        const redirectUrl = new URL(res.headers.location, url).toString();
        resolve(httpGet(redirectUrl, redirectsLeft - 1));
        return;
      }
      if (res.statusCode && res.statusCode >= 400) {
        res.resume(); // drain the body so the connection is not left hanging
        reject(new Error(`HTTP ${res.statusCode}`));
        return;
      }
      // Decode as UTF-8 at the stream level so multi-byte characters split
      // across chunk boundaries are not corrupted by per-chunk toString().
      let data = '';
      res.setEncoding('utf8');
      res.on('data', chunk => data += chunk);
      res.on('end', () => resolve(data));
    });
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
  });
}
/**
 * Convert an HTML document to readable markdown-ish plain text.
 * Strips non-content elements, prefers <main>/<article>/content-div (then
 * <body>) as the extraction root, maps headings/paragraphs/lists to
 * markdown, drops remaining tags, and decodes common entities.
 */
function htmlToText(html: string): string {
  // Remove scripts, styles, and other non-content elements
  let text = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
    .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
    .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
    .replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '');
  // Try to find the main content container
  const mainMatch = text.match(/<main[^>]*>([\s\S]*?)<\/main>/i) ||
    text.match(/<article[^>]*>([\s\S]*?)<\/article>/i) ||
    text.match(/<div[^>]*class="[^"]*content[^"]*"[^>]*>([\s\S]*?)<\/div>/i);
  if (mainMatch) {
    text = mainMatch[1];
  } else {
    // Fall back to body
    const bodyMatch = text.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    if (bodyMatch) text = bodyMatch[1];
  }
  // Convert common elements to markdown-ish format
  text = text
    .replace(/<h1[^>]*>/gi, '\n# ')
    .replace(/<\/h1>/gi, '\n')
    .replace(/<h2[^>]*>/gi, '\n## ')
    .replace(/<\/h2>/gi, '\n')
    .replace(/<h3[^>]*>/gi, '\n### ')
    .replace(/<\/h3>/gi, '\n')
    .replace(/<h[456][^>]*>/gi, '\n#### ')
    .replace(/<\/h[456]>/gi, '\n')
    .replace(/<p[^>]*>/gi, '\n')
    .replace(/<\/p>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<li[^>]*>/gi, '\n- ')
    .replace(/<\/li>/gi, '')
    .replace(/<[^>]+>/g, ' ') // Remove remaining tags
    // Decode entities. Order matters: '&amp;' must be decoded LAST,
    // otherwise '&amp;lt;' double-decodes to '<' instead of '&lt;'.
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    // Generic numeric references (decimal and hex); covers the old '&#39;' case.
    .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(Number(code)))
    .replace(/&#x([0-9a-f]+);/gi, (_, code) => String.fromCharCode(parseInt(code, 16)))
    .replace(/&amp;/g, '&')
    .replace(/\n{3,}/g, '\n\n') // Collapse multiple newlines
    .replace(/[ \t]+/g, ' ') // Collapse spaces
    .trim();
  return text;
}
/**
 * Read a local PDF and extract what text the naive parser can find.
 * Falls back to a placeholder message when nothing could be extracted.
 * For production use, a real parser (pdf-parse / pdfjs-dist) is recommended.
 */
async function fetchPdf(filePath: string): Promise<FetchedContent> {
  const buffer = fs.readFileSync(filePath);
  const extracted = extractPdfText(buffer);
  const content = extracted
    ? extracted
    : '[PDF content could not be extracted. Install pdf-parse for better support.]';
  return {
    title: path.basename(filePath, '.pdf'),
    content,
    sourceType: 'pdf',
    metadata: { filePath, size: buffer.length, fetchedAt: Date.now() },
  };
}
/**
 * Very basic PDF text extraction without external dependencies.
 * Scans text objects (BT…ET) for both `(…) Tj` and `[(…) …] TJ` show-text
 * operators — the original comment promised TJ support but only handled Tj —
 * then falls back to scraping printable runs out of raw (uncompressed)
 * streams. Escaped parens, encodings, and compressed streams are NOT handled;
 * use pdf-parse for real-world PDFs.
 */
function extractPdfText(buffer: Buffer): string {
  const str = buffer.toString('binary');
  const texts: string[] = [];
  // Find text objects
  const textRegex = /BT[\s\S]*?ET/g;
  let match;
  while ((match = textRegex.exec(str)) !== null) {
    const block = match[0];
    // (…) Tj — show a single string
    const tjMatches = block.match(/\(([^)]*)\)\s*Tj/g) || [];
    for (const tj of tjMatches) {
      const textMatch = tj.match(/\(([^)]*)\)/);
      if (textMatch) {
        texts.push(textMatch[1]);
      }
    }
    // [(…) kern (…)] TJ — array form with interleaved kerning numbers;
    // concatenate the string parts of each array.
    const tjArrayMatches = block.match(/\[[^\]]*\]\s*TJ/g) || [];
    for (const tjArray of tjArrayMatches) {
      const parts = tjArray.match(/\(([^)]*)\)/g) || [];
      const joined = parts.map(p => p.slice(1, -1)).join('');
      if (joined) {
        texts.push(joined);
      }
    }
  }
  // Also try to find raw text streams (only works when uncompressed)
  const streamRegex = /stream\s*([\s\S]*?)\s*endstream/g;
  while ((match = streamRegex.exec(str)) !== null) {
    const decoded = match[1].replace(/[^\x20-\x7E\n\r\t]/g, ' ').trim();
    // Heuristic: keep only streams that look like real prose.
    if (decoded.length > 50 && /[a-zA-Z]{3,}/.test(decoded)) {
      texts.push(decoded);
    }
  }
  return texts.join('\n').replace(/\s+/g, ' ').trim();
}
/** Read a local text-like file; HTML files are converted to readable text first. */
async function fetchFile(filePath: string, sourceType: SourceType): Promise<FetchedContent> {
  const raw = fs.readFileSync(filePath, 'utf-8');
  const body = sourceType === 'html' ? htmlToText(raw) : raw;
  return {
    title: path.basename(filePath, path.extname(filePath)),
    content: body,
    sourceType,
    metadata: { filePath, fetchedAt: Date.now() },
  };
}
/**
 * Placeholder: clipboard access needs native, platform-specific bindings
 * that standalone builds do not ship.
 * @throws always (rejected promise).
 */
export async function fetchFromClipboard(): Promise<FetchedContent> {
  return Promise.reject(
    new Error('Clipboard support requires platform-specific implementation')
  );
}
/**
 * Read all of stdin (until EOF) as text content.
 * Rejects after 5s if no data has arrived, so an interactive invocation
 * without piped input does not hang. The timer is cleared once the stream
 * settles — previously it stayed armed, keeping the process alive for up to
 * 5s after a successful read and racing a stale reject against the resolve.
 */
export async function fetchFromStdin(): Promise<FetchedContent> {
  return new Promise((resolve, reject) => {
    let data = '';
    let settled = false;
    process.stdin.setEncoding('utf8');
    // Guard against hanging forever when nothing is piped in.
    const timer = setTimeout(() => {
      if (!data && !settled) {
        settled = true;
        reject(new Error('No input received from stdin'));
      }
    }, 5000);
    process.stdin.on('data', chunk => data += chunk);
    process.stdin.on('end', () => {
      clearTimeout(timer);
      if (settled) return;
      settled = true;
      resolve({
        title: 'Stdin Input',
        content: data.trim(),
        sourceType: 'text',
        metadata: {
          source: 'stdin',
          fetchedAt: Date.now(),
        },
      });
    });
    process.stdin.on('error', (err) => {
      clearTimeout(timer);
      if (!settled) {
        settled = true;
        reject(err);
      }
    });
  });
}

162
src/core/ingest/index.ts Normal file
View File

@@ -0,0 +1,162 @@
import * as crypto from 'crypto';
import { addNode, addEdge, listNodes, query } from '../store';
import { Node } from '../../types';
import { detectSourceType, fetchContent, fetchFromStdin, FetchedContent, SourceType } from './fetchers';
import { chunkContent, ChunkOptions, Chunk } from './chunker';
/** Options controlling how `ingest` fetches, chunks, and links content. */
export interface IngestOptions {
  // Override the title derived from the fetched content.
  title?: string;
  // Extra tags applied to every created node.
  tags?: string[];
  // Chunking overrides (max tokens, overlap, split strategy).
  chunkStrategy?: ChunkOptions;
  // Skip auto-linking new nodes to semantically related ones.
  noLink?: boolean;
  // Read content from stdin instead of `source`.
  stdin?: boolean;
}
/** Summary of a completed (or deduplicated) ingestion. */
export interface IngestResult {
  // False when the content was a duplicate (same checksum) and nothing was stored.
  success: boolean;
  sourceType: SourceType;
  title: string;
  // Number of nodes created: 1 for small content, parent + chunks otherwise.
  nodeCount: number;
  nodes: { id: string; title: string }[];
  // Set only when content was chunked: id of the parent summary node.
  parentId?: string;
}
/**
 * Ingest content from a URL, file path, or stdin into the knowledge graph.
 *
 * Pipeline: fetch → dedupe by MD5 content checksum → chunk → store node(s)
 * → optionally auto-link each new node to semantically related ones.
 *
 * @param source  URL or file path; ignored when `options.stdin` is set.
 * @param options Title/tag overrides, chunking strategy, stdin flag, and
 *                `noLink` to skip auto-linking.
 * @returns `success: false` with an empty node list when content with the
 *          same checksum was already ingested.
 */
export async function ingest(source: string, options: IngestOptions = {}): Promise<IngestResult> {
  // Fetch content (stdin takes precedence over the positional source)
  let fetched: FetchedContent;
  if (options.stdin) {
    fetched = await fetchFromStdin();
  } else {
    const sourceType = detectSourceType(source);
    fetched = await fetchContent(source, sourceType);
  }
  const title = options.title || fetched.title;
  const tags = options.tags || [];
  // Compute checksum for deduplication (content identity only, not security)
  const checksum = crypto.createHash('md5').update(fetched.content).digest('hex');
  // Check for duplicates among previously ingested memory nodes
  const existingDupe = listNodes({ kind: 'memory', tags: ['ingested'] })
    .find(n => (n.metadata as any)?.source?.checksum === checksum);
  if (existingDupe) {
    return {
      success: false,
      sourceType: fetched.sourceType,
      title,
      nodeCount: 0,
      nodes: [],
    };
  }
  // Chunk content if needed (small content yields a single chunk)
  const chunks = chunkContent(fetched.content, options.chunkStrategy);
  const nodes: Node[] = [];
  if (chunks.length === 1) {
    // Single node: everything fits in one chunk
    const node = await addNode({
      kind: 'memory',
      title,
      content: chunks[0].content,
      tags: ['ingested', fetched.sourceType, ...tags],
      metadata: {
        source: {
          type: fetched.sourceType,
          ...fetched.metadata,
          checksum,
        },
        tokenEstimate: chunks[0].tokenEstimate,
      },
    });
    nodes.push(node);
  } else {
    // Create parent node with summary: source info, chunk count, and a
    // 500-char preview of the raw content
    const summaryContent = `Ingested content from ${fetched.sourceType} source.\n\n` +
      `**Source:** ${fetched.metadata.url || fetched.metadata.filePath || 'stdin'}\n` +
      `**Chunks:** ${chunks.length}\n` +
      `**Total tokens:** ~${chunks.reduce((sum, c) => sum + c.tokenEstimate, 0)}\n\n` +
      `## Preview\n\n${fetched.content.slice(0, 500)}...`;
    const parentNode = await addNode({
      kind: 'memory',
      title,
      content: summaryContent,
      tags: ['ingested', fetched.sourceType, 'parent', ...tags],
      metadata: {
        source: {
          type: fetched.sourceType,
          ...fetched.metadata,
          // Only the parent carries the checksum used for deduplication
          checksum,
        },
        chunkCount: chunks.length,
      },
    });
    nodes.push(parentNode);
    // Create child nodes for each chunk, attached to the parent via 'contains'
    for (const chunk of chunks) {
      const chunkNode = await addNode({
        kind: 'memory',
        title: `${title} (Part ${chunk.index + 1}/${chunks.length})`,
        content: chunk.content,
        tags: ['ingested', fetched.sourceType, 'chunk', ...tags],
        metadata: {
          parentId: parentNode.id,
          chunkIndex: chunk.index,
          tokenEstimate: chunk.tokenEstimate,
        },
      });
      nodes.push(chunkNode);
      addEdge(parentNode.id, chunkNode.id, 'contains');
    }
  }
  // Auto-link to related nodes (if not disabled)
  if (!options.noLink) {
    for (const node of nodes) {
      await linkRelatedNodes(node);
    }
  }
  return {
    success: true,
    sourceType: fetched.sourceType,
    title,
    nodeCount: nodes.length,
    nodes: nodes.map(n => ({ id: n.id, title: n.title })),
    // When chunked, nodes[0] is the parent summary node
    parentId: chunks.length > 1 ? nodes[0].id : undefined,
  };
}
/**
 * Find semantically related existing nodes and add `relates_to` edges.
 * Best-effort: search or edge failures are swallowed so ingestion never
 * fails because of linking.
 */
async function linkRelatedNodes(node: Node): Promise<void> {
  // Use the title plus a content prefix as the similarity query.
  const searchText = node.title + ' ' + node.content.slice(0, 500);
  try {
    const related = await query(searchText, { limit: 5 });
    const ownChecksum = (node.metadata as any)?.source?.checksum;
    const ownParent = (node.metadata as any)?.parentId;
    for (const result of related) {
      // Don't link to self.
      if (result.node.id === node.id) continue;
      // Don't link siblings from the same ingestion. The original compared
      // checksums unconditionally, so for chunk nodes (which carry no
      // source.checksum) `undefined === undefined` skipped linking to every
      // node without a checksum — i.e. most of the graph.
      const otherChecksum = (result.node.metadata as any)?.source?.checksum;
      if (ownChecksum !== undefined && ownChecksum === otherChecksum) continue;
      // Chunk nodes: skip the parent (already linked via 'contains') and
      // sibling chunks from the same parent.
      if (ownParent !== undefined) {
        if (result.node.id === ownParent) continue;
        if ((result.node.metadata as any)?.parentId === ownParent) continue;
      }
      // Only link if relevance is high enough.
      if (result.score > 0.5) {
        try {
          addEdge(node.id, result.node.id, 'relates_to', { reason: 'semantic-similarity', score: result.score });
        } catch {
          // Edge might already exist
        }
      }
    }
  } catch {
    // Search might fail, that's okay — linking is optional
  }
}
export { detectSourceType, fetchContent, fetchFromStdin, SourceType } from './fetchers';
export { chunkContent, estimateTokens, ChunkOptions, Chunk } from './chunker';

View File

@@ -650,6 +650,56 @@ server.tool(
}
);
// --- memory_ingest ---
import { ingest } from '../core/ingest';
server.tool(
  'memory_ingest',
  'Ingest content from a URL or text into the knowledge graph',
  {
    source: z.string().describe('URL or raw text to ingest'),
    title: z.string().optional().describe('Override title'),
    tags: z.array(z.string()).optional().describe('Tags to apply'),
    isUrl: z.boolean().optional().describe('Treat source as URL (auto-detected if not specified)'),
  },
  async ({ source, title, tags, isUrl }) => {
    // Auto-detect URLs unless the caller explicitly overrides.
    const looksLikeUrl = source.startsWith('http://') || source.startsWith('https://');
    const treatAsUrl = isUrl ?? looksLikeUrl;
    if (treatAsUrl) {
      // Full ingestion pipeline: fetch, dedupe, chunk, link.
      const result = await ingest(source, { title, tags });
      return { content: [{ type: 'text' as const, text: serialize(result) }] };
    }
    // Raw text: store directly as a single memory node.
    const node = await addNode({
      kind: 'memory',
      title: title || 'Ingested Content',
      content: source,
      tags: ['ingested', 'text', ...(tags || [])],
      metadata: { source: { type: 'text', ingestedAt: Date.now() } },
    });
    return {
      content: [
        { type: 'text' as const, text: serialize({ success: true, nodeId: node.id, title: node.title }) },
      ],
    };
  }
);
server.tool(
  'memory_clip',
  'Quick clip a URL into memory',
  {
    url: z.string().describe('URL to clip'),
    title: z.string().optional().describe('Override title'),
    tags: z.array(z.string()).optional().describe('Tags to apply'),
  },
  async ({ url, title, tags }) => {
    // Reject anything that is not an http(s) URL before fetching.
    const isHttpUrl = url.startsWith('http://') || url.startsWith('https://');
    if (!isHttpUrl) {
      return {
        content: [{ type: 'text' as const, text: serialize({ error: 'Invalid URL' }) }],
        isError: true,
      };
    }
    const result = await ingest(url, { title, tags });
    return { content: [{ type: 'text' as const, text: serialize(result) }] };
  }
);
// --- memory_index ---
import { indexProject } from '../core/indexer';