Files
cortex/src/cli/commands/ingest.ts
omigamedev c65a5bb03a Add URL and content ingestion (Milestone 6)
- Add URL fetching with HTML-to-text extraction
- Add basic PDF text extraction
- Add smart content chunking with overlap
- Add deduplication via content checksums
- Add auto-linking to semantically related nodes
- Add CLI commands: ingest, clip
- Add MCP tools: memory_ingest, memory_clip
2026-02-03 11:00:28 +01:00

96 lines
3.1 KiB
TypeScript

import { Command } from 'commander';
import chalk from 'chalk';
import { ingest } from '../../core/ingest';
/**
 * `ingest` CLI command.
 *
 * Forwards the positional source (or an empty string when only `--stdin` is
 * given) together with the parsed options to `ingest()` and prints a summary
 * of the result: title, source type, node count, optional parent id, and up
 * to five of the created nodes.
 *
 * Exits the process with code 1 when neither a source nor `--stdin` is
 * supplied, when `--chunk-size` is not a positive integer, or when `ingest()`
 * throws.
 */
export const ingestCommand = new Command('ingest')
  .description('Ingest content from URLs, files, or stdin into the knowledge graph')
  .argument('[source]', 'URL or file path to ingest')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .option('--stdin', 'Read content from stdin')
  .option('--chunk-size <n>', 'Max tokens per chunk (default: 1000)')
  .option('--no-link', 'Skip auto-linking to related nodes')
  .action(async (source: string | undefined, opts) => {
    try {
      if (!source && !opts.stdin) {
        console.error(chalk.red('Error: Provide a source URL/file or use --stdin'));
        process.exit(1);
      }

      // Validate --chunk-size up front so a typo never reaches ingest() as NaN
      // (or as a zero/negative token budget).
      let chunkStrategy: { maxTokens: number } | undefined;
      if (opts.chunkSize) {
        const maxTokens = Number.parseInt(opts.chunkSize, 10);
        if (Number.isNaN(maxTokens) || maxTokens <= 0) {
          console.error(chalk.red('Error: --chunk-size must be a positive integer'));
          process.exit(1);
        }
        chunkStrategy = { maxTokens };
      }

      // Drop empty entries produced by inputs such as "a,,b" or a trailing comma.
      const tags = opts.tags
        ?.split(',')
        .map((t: string) => t.trim())
        .filter((t: string) => t.length > 0);

      if (opts.stdin) {
        console.log(chalk.cyan('Reading from stdin... (Ctrl+D to end)'));
      } else {
        console.log(chalk.cyan(`Ingesting: ${source}`));
      }

      const result = await ingest(source ?? '', {
        title: opts.title,
        tags,
        stdin: opts.stdin,
        // Commander exposes the negatable `--no-link` flag as `opts.link`
        // (true unless the flag is passed).
        noLink: !opts.link,
        chunkStrategy,
      });

      if (!result.success) {
        console.log(chalk.yellow('Content already exists (duplicate checksum)'));
        return;
      }

      console.log();
      console.log(chalk.green(`✓ Ingested: ${result.title}`));
      console.log();
      console.log(` Type: ${result.sourceType}`);
      console.log(` Nodes: ${result.nodeCount}`);
      if (result.parentId) {
        console.log(` Parent: ${result.parentId.slice(0, 8)}`);
      }

      // List only the first five nodes; summarise the remainder as a count.
      for (const node of result.nodes.slice(0, 5)) {
        console.log(chalk.dim(` - ${node.id.slice(0, 8)} ${node.title}`));
      }
      if (result.nodes.length > 5) {
        console.log(chalk.dim(` ... and ${result.nodes.length - 5} more`));
      }
    } catch (err: unknown) {
      // Anything can be thrown; fall back to String() for non-Error values
      // instead of printing "undefined".
      const message = err instanceof Error ? err.message : String(err);
      console.error(chalk.red(`Error: ${message}`));
      process.exit(1);
    }
  });
// Alias for quick URL clipping
/**
 * `clip` CLI command: a reduced form of `ingest` that only accepts an
 * `http://` or `https://` URL plus optional title and tags.
 *
 * Prints the resulting title and, when at least one node was created, the
 * short id of the first node. Exits the process with code 1 when the argument
 * is not an http(s) URL or when `ingest()` throws.
 */
export const clipCommand = new Command('clip')
  .description('Quick clip a URL (alias for ingest)')
  .argument('<url>', 'URL to clip')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .action(async (url: string, opts) => {
    try {
      if (!url.startsWith('http://') && !url.startsWith('https://')) {
        console.error(chalk.red('Error: clip expects a URL'));
        process.exit(1);
      }

      // Drop empty entries produced by inputs such as "a,,b" or a trailing comma.
      const tags = opts.tags
        ?.split(',')
        .map((t: string) => t.trim())
        .filter((t: string) => t.length > 0);

      console.log(chalk.cyan(`Clipping: ${url}`));
      const result = await ingest(url, {
        title: opts.title,
        tags,
      });

      if (!result.success) {
        console.log(chalk.yellow('Already clipped (duplicate)'));
        return;
      }

      console.log(chalk.green(`${result.title}`));
      // Guard against an empty node list so a successful clip is never
      // reported as a failure by an incidental TypeError.
      const firstNode = result.nodes[0];
      if (firstNode) {
        console.log(chalk.dim(` ${firstNode.id.slice(0, 8)}`));
      }
    } catch (err: unknown) {
      // Anything can be thrown; fall back to String() for non-Error values
      // instead of printing "undefined".
      const message = err instanceof Error ? err.message : String(err);
      console.error(chalk.red(`Error: ${message}`));
      process.exit(1);
    }
  });