Add URL and content ingestion (Milestone 6)
- Add URL fetching with HTML-to-text extraction
- Add basic PDF text extraction
- Add smart content chunking with overlap
- Add deduplication via content checksums
- Add auto-linking to semantically related nodes
- Add CLI commands: ingest, clip
- Add MCP tools: memory_ingest, memory_clip
This commit is contained in:
95
src/cli/commands/ingest.ts
Normal file
95
src/cli/commands/ingest.ts
Normal file
@@ -0,0 +1,95 @@
|
||||
import { Command } from 'commander';
|
||||
import chalk from 'chalk';
|
||||
import { ingest } from '../../core/ingest';
|
||||
|
||||
/**
 * `ingest` CLI command.
 *
 * Forwards a source (URL or file path) or stdin to the core `ingest()`
 * function together with the parsed options, then prints a short summary of
 * the result: title, source type, node count, optional parent id, and up to
 * five of the returned nodes.
 *
 * The process exits with status 1 when:
 * - neither a source argument nor `--stdin` is supplied,
 * - `--chunk-size` is not a positive integer,
 * - `ingest()` (or anything else in the handler) throws.
 *
 * An unsuccessful result (`result.success` falsy) is reported as a duplicate
 * and is not treated as an error.
 */
export const ingestCommand = new Command('ingest')
  .description('Ingest content from URLs, files, or stdin into the knowledge graph')
  .argument('[source]', 'URL or file path to ingest')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .option('--stdin', 'Read content from stdin')
  .option('--chunk-size <n>', 'Max tokens per chunk (default: 1000)')
  .option('--no-link', 'Skip auto-linking to related nodes')
  .action(async (source: string | undefined, opts) => {
    try {
      if (!source && !opts.stdin) {
        console.error(chalk.red('Error: Provide a source URL/file or use --stdin'));
        process.exit(1);
      }

      // Validate --chunk-size before doing any work: a non-numeric value would
      // otherwise be forwarded to ingest() as NaN.
      let maxTokens: number | undefined;
      if (opts.chunkSize !== undefined) {
        maxTokens = Number(opts.chunkSize);
        if (!Number.isInteger(maxTokens) || maxTokens <= 0) {
          console.error(chalk.red('Error: --chunk-size must be a positive integer'));
          process.exit(1);
        }
      }

      if (opts.stdin) {
        console.log(chalk.cyan('Reading from stdin... (Ctrl+D to end)'));
      } else {
        console.log(chalk.cyan(`Ingesting: ${source}`));
      }

      const result = await ingest(source ?? '', {
        title: opts.title,
        // Drop empty entries so "a,,b" or a trailing comma does not create a
        // blank tag.
        tags: opts.tags
          ?.split(',')
          .map((t: string) => t.trim())
          .filter((t: string) => t.length > 0),
        stdin: opts.stdin,
        // `--no-link` is a negatable flag: commander exposes it as `opts.link`,
        // which is true unless the flag was passed.
        noLink: !opts.link,
        chunkStrategy: maxTokens !== undefined ? { maxTokens } : undefined,
      });

      if (!result.success) {
        console.log(chalk.yellow('Content already exists (duplicate checksum)'));
        return;
      }

      console.log();
      console.log(chalk.green(`✓ Ingested: ${result.title}`));
      console.log();
      console.log(`  Type: ${result.sourceType}`);
      console.log(`  Nodes: ${result.nodeCount}`);

      if (result.parentId) {
        console.log(`  Parent: ${result.parentId.slice(0, 8)}`);
      }

      // Show at most five nodes, then summarise the remainder.
      for (const node of result.nodes.slice(0, 5)) {
        console.log(chalk.dim(`    - ${node.id.slice(0, 8)} ${node.title}`));
      }

      if (result.nodes.length > 5) {
        console.log(chalk.dim(`    ... and ${result.nodes.length - 5} more`));
      }
    } catch (err: unknown) {
      // Anything can be thrown; only Error instances carry a `message`.
      const message = err instanceof Error ? err.message : String(err);
      console.error(chalk.red(`Error: ${message}`));
      process.exit(1);
    }
  });
|
||||
|
||||
// Alias for quick URL clipping
|
||||
/**
 * `clip` CLI command: a shortcut that passes a single http(s) URL to the core
 * `ingest()` function with an optional title and tags.
 *
 * The process exits with status 1 when the argument does not start with
 * `http://` or `https://`, or when the handler throws. An unsuccessful result
 * (`result.success` falsy) is reported as a duplicate and is not treated as
 * an error.
 */
export const clipCommand = new Command('clip')
  .description('Quick clip a URL (alias for ingest)')
  .argument('<url>', 'URL to clip')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .action(async (url: string, opts) => {
    try {
      if (!url.startsWith('http://') && !url.startsWith('https://')) {
        console.error(chalk.red('Error: clip expects a URL'));
        process.exit(1);
      }

      console.log(chalk.cyan(`Clipping: ${url}`));

      const result = await ingest(url, {
        title: opts.title,
        // Drop empty entries so "a,,b" or a trailing comma does not create a
        // blank tag.
        tags: opts.tags
          ?.split(',')
          .map((t: string) => t.trim())
          .filter((t: string) => t.length > 0),
      });

      if (!result.success) {
        console.log(chalk.yellow('Already clipped (duplicate)'));
        return;
      }

      console.log(chalk.green(`✓ ${result.title}`));

      // Guard the first-node lookup: an empty `nodes` array would otherwise
      // throw a TypeError that gets reported as a failed clip.
      const firstNode = result.nodes[0];
      if (firstNode) {
        console.log(chalk.dim(`  ${firstNode.id.slice(0, 8)}`));
      }
    } catch (err: unknown) {
      // Anything can be thrown; only Error instances carry a `message`.
      const message = err instanceof Error ? err.message : String(err);
      console.error(chalk.red(`Error: ${message}`));
      process.exit(1);
    }
  });
|
||||
@@ -18,6 +18,7 @@ import { captureCommand, captureHookCommand, configCommand } from './commands/ca
|
||||
import { contextCommand, contextHookCommand } from './commands/context';
|
||||
import { indexCommand } from './commands/index-cmd';
|
||||
import { journalCommand, journalAliasCommand, quickCaptureCommand } from './commands/journal';
|
||||
import { ingestCommand, clipCommand } from './commands/ingest';
|
||||
import { closeDb } from '../core/db';
|
||||
|
||||
const program = new Command();
|
||||
@@ -50,6 +51,8 @@ program.addCommand(indexCommand);
|
||||
program.addCommand(journalCommand);
|
||||
program.addCommand(journalAliasCommand);
|
||||
program.addCommand(quickCaptureCommand);
|
||||
program.addCommand(ingestCommand);
|
||||
program.addCommand(clipCommand);
|
||||
|
||||
program.hook('postAction', () => {
|
||||
closeDb();
|
||||
|
||||
Reference in New Issue
Block a user