Add URL and content ingestion (Milestone 6)

- Add URL fetching with HTML-to-text extraction
- Add basic PDF text extraction
- Add smart content chunking with overlap
- Add deduplication via content checksums
- Add auto-linking to semantically related nodes
- Add CLI commands: ingest, clip
- Add MCP tools: memory_ingest, memory_clip
This commit is contained in:
2026-02-03 11:00:28 +01:00
parent 67b1e3b481
commit c65a5bb03a
6 changed files with 655 additions and 0 deletions

View File

@@ -0,0 +1,95 @@
import { Command } from 'commander';
import chalk from 'chalk';
import { ingest } from '../../core/ingest';
/**
 * CLI `ingest` command: pull content from a URL, file path, or stdin into
 * the knowledge graph, with optional chunk-size and auto-link controls.
 */
export const ingestCommand = new Command('ingest')
  .description('Ingest content from URLs, files, or stdin into the knowledge graph')
  .argument('[source]', 'URL or file path to ingest')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .option('--stdin', 'Read content from stdin')
  .option('--chunk-size <n>', 'Max tokens per chunk (default: 1000)')
  .option('--no-link', 'Skip auto-linking to related nodes')
  .action(async (source: string | undefined, opts) => {
    try {
      if (!source && !opts.stdin) {
        console.error(chalk.red('Error: Provide a source URL/file or use --stdin'));
        process.exit(1);
      }
      // Validate --chunk-size before fetching anything: bare parseInt would
      // silently turn a typo into NaN-sized chunks.
      let maxTokens: number | undefined;
      if (opts.chunkSize !== undefined) {
        maxTokens = parseInt(opts.chunkSize, 10);
        if (!Number.isInteger(maxTokens) || maxTokens <= 0) {
          console.error(chalk.red('Error: --chunk-size must be a positive integer'));
          process.exit(1);
        }
      }
      if (opts.stdin) {
        console.log(chalk.cyan('Reading from stdin... (Ctrl+D to end)'));
      } else {
        console.log(chalk.cyan(`Ingesting: ${source}`));
      }
      const result = await ingest(source || '', {
        title: opts.title,
        tags: opts.tags?.split(',').map((t: string) => t.trim()),
        stdin: opts.stdin,
        noLink: !opts.link, // commander's --no-link flag sets opts.link = false
        chunkStrategy: maxTokens !== undefined ? { maxTokens } : undefined,
      });
      if (!result.success) {
        // ingest() reports success:false only for duplicate-checksum content.
        console.log(chalk.yellow('Content already exists (duplicate checksum)'));
        return;
      }
      console.log();
      console.log(chalk.green(`✓ Ingested: ${result.title}`));
      console.log();
      console.log(`  Type: ${result.sourceType}`);
      console.log(`  Nodes: ${result.nodeCount}`);
      if (result.parentId) {
        console.log(`  Parent: ${result.parentId.slice(0, 8)}`);
      }
      // Show at most five created nodes, then a summary line.
      for (const node of result.nodes.slice(0, 5)) {
        console.log(chalk.dim(`  - ${node.id.slice(0, 8)} ${node.title}`));
      }
      if (result.nodes.length > 5) {
        console.log(chalk.dim(`  ... and ${result.nodes.length - 5} more`));
      }
    } catch (err: unknown) {
      // Narrow unknown before touching .message instead of assuming `any`.
      const message = err instanceof Error ? err.message : String(err);
      console.error(chalk.red(`Error: ${message}`));
      process.exit(1);
    }
  });
/**
 * CLI `clip` command: shorthand for ingesting a single web URL.
 */
export const clipCommand = new Command('clip')
  .description('Quick clip a URL (alias for ingest)')
  .argument('<url>', 'URL to clip')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .action(async (url: string, opts) => {
    try {
      // clip only accepts web URLs; anything else should go through `ingest`.
      const isHttpUrl = url.startsWith('http://') || url.startsWith('https://');
      if (!isHttpUrl) {
        console.error(chalk.red('Error: clip expects a URL'));
        process.exit(1);
      }
      console.log(chalk.cyan(`Clipping: ${url}`));
      const tagList = opts.tags?.split(',').map((tag: string) => tag.trim());
      const result = await ingest(url, { title: opts.title, tags: tagList });
      if (result.success) {
        console.log(chalk.green(`${result.title}`));
        console.log(chalk.dim(`  ${result.nodes[0].id.slice(0, 8)}`));
      } else {
        console.log(chalk.yellow('Already clipped (duplicate)'));
      }
    } catch (err: any) {
      console.error(chalk.red(`Error: ${err.message}`));
      process.exit(1);
    }
  });

View File

@@ -18,6 +18,7 @@ import { captureCommand, captureHookCommand, configCommand } from './commands/ca
import { contextCommand, contextHookCommand } from './commands/context';
import { indexCommand } from './commands/index-cmd';
import { journalCommand, journalAliasCommand, quickCaptureCommand } from './commands/journal';
import { ingestCommand, clipCommand } from './commands/ingest';
import { closeDb } from '../core/db';
const program = new Command();
@@ -50,6 +51,8 @@ program.addCommand(indexCommand);
program.addCommand(journalCommand);
program.addCommand(journalAliasCommand);
program.addCommand(quickCaptureCommand);
program.addCommand(ingestCommand);
program.addCommand(clipCommand);
program.hook('postAction', () => {
closeDb();

View File

@@ -0,0 +1,93 @@
/** Tuning knobs for {@link chunkContent}. */
export interface ChunkOptions {
  /** Soft cap on tokens per chunk (default 1000). */
  maxTokens?: number;
  /** Tokens of trailing context carried into the next chunk (default 100; 0 disables). */
  overlap?: number;
  /** Boundary strategy used when splitting (default 'paragraph'). */
  splitOn?: 'paragraph' | 'sentence' | 'heading';
}

/** One piece of chunked content. */
export interface Chunk {
  /** Zero-based position of the chunk in source order. */
  index: number;
  content: string;
  /** Rough token count (length / CHARS_PER_TOKEN, rounded up). */
  tokenEstimate: number;
}

const DEFAULT_MAX_TOKENS = 1000;
const DEFAULT_OVERLAP = 100;
const CHARS_PER_TOKEN = 4; // Rough estimate

/**
 * Split `content` into chunks of at most ~maxTokens, preferring natural
 * boundaries (paragraphs/sentences/headings) and optionally carrying
 * trailing overlap between consecutive chunks for context continuity.
 */
export function chunkContent(content: string, options: ChunkOptions = {}): Chunk[] {
  // `??` (not `||`) so an explicit `overlap: 0` actually disables overlap
  // instead of silently falling back to the default of 100.
  const maxTokens = options.maxTokens ?? DEFAULT_MAX_TOKENS;
  const overlap = options.overlap ?? DEFAULT_OVERLAP;
  const splitOn = options.splitOn ?? 'paragraph';
  const maxChars = maxTokens * CHARS_PER_TOKEN;
  const overlapChars = overlap * CHARS_PER_TOKEN;

  // If content is small enough, return it as a single chunk.
  if (content.length <= maxChars) {
    return [{
      index: 0,
      content,
      tokenEstimate: Math.ceil(content.length / CHARS_PER_TOKEN),
    }];
  }

  // Split into segments based on strategy, then pack segments into chunks.
  const segments = splitContent(content, splitOn);
  const chunks: Chunk[] = [];
  let currentChunk = '';

  // Push the buffered chunk (trimmed); whitespace-only buffers are skipped.
  const flush = () => {
    const trimmed = currentChunk.trim();
    if (trimmed) {
      chunks.push({
        index: chunks.length,
        content: trimmed,
        tokenEstimate: Math.ceil(currentChunk.length / CHARS_PER_TOKEN),
      });
    }
  };

  for (const segment of segments) {
    // A single segment larger than the whole budget (e.g. one giant
    // paragraph) must be hard-split — previously it was appended whole and
    // produced one oversized chunk.
    if (segment.length > maxChars) {
      if (currentChunk.length > 0) {
        flush();
        currentChunk = '';
      }
      let start = 0;
      while (start < segment.length) {
        currentChunk = segment.slice(start, start + maxChars);
        start += maxChars;
        // Keep the final slice buffered so following segments can join it.
        if (start < segment.length) flush();
      }
      continue;
    }
    if (currentChunk.length + segment.length > maxChars && currentChunk.length > 0) {
      flush();
      // Seed the next chunk with trailing overlap from the previous one.
      currentChunk = overlapChars > 0 && currentChunk.length > overlapChars
        ? currentChunk.slice(-overlapChars) + segment
        : segment;
    } else {
      currentChunk += segment;
    }
  }
  flush(); // Don't forget the last chunk
  return chunks;
}

/** Split content into raw segments according to the chosen boundary strategy. */
function splitContent(content: string, strategy: 'paragraph' | 'sentence' | 'heading'): string[] {
  switch (strategy) {
    case 'heading':
      // Split on markdown headings
      return content.split(/(?=^#{1,6}\s)/m).filter(s => s.trim());
    case 'sentence':
      // Split on sentence boundaries
      return content.split(/(?<=[.!?])\s+/).filter(s => s.trim());
    case 'paragraph':
    default:
      // Split on double newlines, re-appending the separator so joins stay readable.
      return content.split(/\n\n+/).filter(s => s.trim()).map(s => s + '\n\n');
  }
}

/** Rough token estimate: ~4 characters per token. */
export function estimateTokens(text: string): number {
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}

252
src/core/ingest/fetchers.ts Normal file
View File

@@ -0,0 +1,252 @@
import * as fs from 'fs';
import * as path from 'path';
import * as https from 'https';
import * as http from 'http';
/** Supported input kinds: remote URLs, or local pdf/markdown/html/plain-text files. */
export type SourceType = 'url' | 'pdf' | 'markdown' | 'text' | 'html';

/** Normalized result of fetching any supported source. */
export interface FetchedContent {
  /** Human-readable title (HTML <title>, file basename, or hostname fallback). */
  title: string;
  /** Extracted plain-text / markdown-ish content. */
  content: string;
  sourceType: SourceType;
  /** Source-specific details (url or filePath, fetchedAt timestamp, size, …). */
  metadata: Record<string, any>;
}

/**
 * Classify a source string: anything starting with http(s):// is a URL,
 * otherwise the file extension decides (unknown extensions fall back to text).
 */
export function detectSourceType(source: string): SourceType {
  if (/^https?:\/\//.test(source)) {
    return 'url';
  }
  const byExtension: Record<string, SourceType> = {
    '.pdf': 'pdf',
    '.md': 'markdown',
    '.html': 'html',
    '.htm': 'html',
  };
  return byExtension[path.extname(source).toLowerCase()] ?? 'text';
}
/**
 * Dispatch to the fetcher matching the detected source type.
 * @throws Error when no fetcher is registered for the type.
 */
export async function fetchContent(source: string, sourceType: SourceType): Promise<FetchedContent> {
  if (sourceType === 'url') {
    return fetchUrl(source);
  }
  if (sourceType === 'pdf') {
    return fetchPdf(source);
  }
  if (sourceType === 'markdown' || sourceType === 'text' || sourceType === 'html') {
    return fetchFile(source, sourceType);
  }
  throw new Error(`Unsupported source type: ${sourceType}`);
}
/**
 * Fetch a URL and convert its HTML into readable text.
 * Title comes from the <title> tag, falling back to the hostname.
 */
async function fetchUrl(url: string): Promise<FetchedContent> {
  const html = await httpGet(url);
  const titleTag = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
  const pageTitle = titleTag?.[1]?.trim() || new URL(url).hostname;
  return {
    title: pageTitle,
    content: htmlToText(html),
    sourceType: 'url',
    metadata: { url, fetchedAt: Date.now() },
  };
}
/**
 * Minimal HTTP(S) GET resolving with the response body as UTF-8 text.
 *
 * @param url           Absolute http(s) URL.
 * @param redirectsLeft Remaining redirect hops; bounds the recursion so a
 *                      redirect loop fails fast instead of recursing forever.
 * @throws on HTTP >= 400, 30s timeout, network errors, or too many redirects.
 */
function httpGet(url: string, redirectsLeft: number = 5): Promise<string> {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https') ? https : http;
    const req = client.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; Cortex/1.0)',
        'Accept': 'text/html,application/xhtml+xml,text/plain,*/*',
      },
      timeout: 30000,
    }, (res) => {
      // Handle redirects (3xx with a Location header), bounded by redirectsLeft.
      if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        res.resume(); // drain the body so the socket is released
        if (redirectsLeft <= 0) {
          reject(new Error('Too many redirects'));
          return;
        }
        const redirectUrl = new URL(res.headers.location, url).toString();
        resolve(httpGet(redirectUrl, redirectsLeft - 1));
        return;
      }
      if (res.statusCode && res.statusCode >= 400) {
        res.resume(); // drain the body so the connection is not left hanging
        reject(new Error(`HTTP ${res.statusCode}`));
        return;
      }
      // Decode as UTF-8 at the stream level so multi-byte characters split
      // across chunk boundaries are not corrupted by per-chunk toString().
      let data = '';
      res.setEncoding('utf8');
      res.on('data', chunk => data += chunk);
      res.on('end', () => resolve(data));
    });
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
  });
}
/**
 * Convert an HTML document to readable markdown-ish plain text.
 * Strips non-content elements, prefers <main>/<article>/content-div (then
 * <body>) as the extraction root, maps headings/paragraphs/lists to
 * markdown, drops remaining tags, and decodes common entities.
 */
function htmlToText(html: string): string {
  // Remove scripts, styles, and other non-content elements
  let text = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
    .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
    .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
    .replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '');
  // Try to find the main content container
  const mainMatch = text.match(/<main[^>]*>([\s\S]*?)<\/main>/i) ||
    text.match(/<article[^>]*>([\s\S]*?)<\/article>/i) ||
    text.match(/<div[^>]*class="[^"]*content[^"]*"[^>]*>([\s\S]*?)<\/div>/i);
  if (mainMatch) {
    text = mainMatch[1];
  } else {
    // Fall back to body
    const bodyMatch = text.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    if (bodyMatch) text = bodyMatch[1];
  }
  // Convert common elements to markdown-ish format
  text = text
    .replace(/<h1[^>]*>/gi, '\n# ')
    .replace(/<\/h1>/gi, '\n')
    .replace(/<h2[^>]*>/gi, '\n## ')
    .replace(/<\/h2>/gi, '\n')
    .replace(/<h3[^>]*>/gi, '\n### ')
    .replace(/<\/h3>/gi, '\n')
    .replace(/<h[456][^>]*>/gi, '\n#### ')
    .replace(/<\/h[456]>/gi, '\n')
    .replace(/<p[^>]*>/gi, '\n')
    .replace(/<\/p>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<li[^>]*>/gi, '\n- ')
    .replace(/<\/li>/gi, '')
    .replace(/<[^>]+>/g, ' ') // Remove remaining tags
    // Decode entities. Order matters: '&amp;' must be decoded LAST,
    // otherwise '&amp;lt;' double-decodes to '<' instead of '&lt;'.
    .replace(/&nbsp;/g, ' ')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    // Generic numeric references (decimal and hex); covers the old '&#39;' case.
    .replace(/&#(\d+);/g, (_, code) => String.fromCharCode(Number(code)))
    .replace(/&#x([0-9a-f]+);/gi, (_, code) => String.fromCharCode(parseInt(code, 16)))
    .replace(/&amp;/g, '&')
    .replace(/\n{3,}/g, '\n\n') // Collapse multiple newlines
    .replace(/[ \t]+/g, ' ') // Collapse spaces
    .trim();
  return text;
}
/**
 * Read a local PDF and extract what text the naive parser can find.
 * Falls back to a placeholder message when nothing could be extracted.
 * For production use, a real parser (pdf-parse / pdfjs-dist) is recommended.
 */
async function fetchPdf(filePath: string): Promise<FetchedContent> {
  const buffer = fs.readFileSync(filePath);
  const extracted = extractPdfText(buffer);
  const content = extracted
    ? extracted
    : '[PDF content could not be extracted. Install pdf-parse for better support.]';
  return {
    title: path.basename(filePath, '.pdf'),
    content,
    sourceType: 'pdf',
    metadata: { filePath, size: buffer.length, fetchedAt: Date.now() },
  };
}
/**
 * Very basic PDF text extraction without external dependencies.
 * Scans text objects (BT…ET) for both `(…) Tj` and `[(…) …] TJ` show-text
 * operators — the original comment promised TJ support but only handled Tj —
 * then falls back to scraping printable runs out of raw (uncompressed)
 * streams. Escaped parens, encodings, and compressed streams are NOT handled;
 * use pdf-parse for real-world PDFs.
 */
function extractPdfText(buffer: Buffer): string {
  const str = buffer.toString('binary');
  const texts: string[] = [];
  // Find text objects
  const textRegex = /BT[\s\S]*?ET/g;
  let match;
  while ((match = textRegex.exec(str)) !== null) {
    const block = match[0];
    // (…) Tj — show a single string
    const tjMatches = block.match(/\(([^)]*)\)\s*Tj/g) || [];
    for (const tj of tjMatches) {
      const textMatch = tj.match(/\(([^)]*)\)/);
      if (textMatch) {
        texts.push(textMatch[1]);
      }
    }
    // [(…) kern (…)] TJ — array form with interleaved kerning numbers;
    // concatenate the string parts of each array.
    const tjArrayMatches = block.match(/\[[^\]]*\]\s*TJ/g) || [];
    for (const tjArray of tjArrayMatches) {
      const parts = tjArray.match(/\(([^)]*)\)/g) || [];
      const joined = parts.map(p => p.slice(1, -1)).join('');
      if (joined) {
        texts.push(joined);
      }
    }
  }
  // Also try to find raw text streams (only works when uncompressed)
  const streamRegex = /stream\s*([\s\S]*?)\s*endstream/g;
  while ((match = streamRegex.exec(str)) !== null) {
    const decoded = match[1].replace(/[^\x20-\x7E\n\r\t]/g, ' ').trim();
    // Heuristic: keep only streams that look like real prose.
    if (decoded.length > 50 && /[a-zA-Z]{3,}/.test(decoded)) {
      texts.push(decoded);
    }
  }
  return texts.join('\n').replace(/\s+/g, ' ').trim();
}
/** Read a local text-like file; HTML files are converted to readable text first. */
async function fetchFile(filePath: string, sourceType: SourceType): Promise<FetchedContent> {
  const raw = fs.readFileSync(filePath, 'utf-8');
  const body = sourceType === 'html' ? htmlToText(raw) : raw;
  return {
    title: path.basename(filePath, path.extname(filePath)),
    content: body,
    sourceType,
    metadata: { filePath, fetchedAt: Date.now() },
  };
}
/**
 * Placeholder: clipboard access needs native, platform-specific bindings
 * that standalone builds do not ship.
 * @throws always (rejected promise).
 */
export async function fetchFromClipboard(): Promise<FetchedContent> {
  return Promise.reject(
    new Error('Clipboard support requires platform-specific implementation')
  );
}
/**
 * Read all of stdin (until EOF) as text content.
 * Rejects after 5s if no data has arrived, so an interactive invocation
 * without piped input does not hang. The timer is cleared once the stream
 * settles — previously it stayed armed, keeping the process alive for up to
 * 5s after a successful read and racing a stale reject against the resolve.
 */
export async function fetchFromStdin(): Promise<FetchedContent> {
  return new Promise((resolve, reject) => {
    let data = '';
    let settled = false;
    process.stdin.setEncoding('utf8');
    // Guard against hanging forever when nothing is piped in.
    const timer = setTimeout(() => {
      if (!data && !settled) {
        settled = true;
        reject(new Error('No input received from stdin'));
      }
    }, 5000);
    process.stdin.on('data', chunk => data += chunk);
    process.stdin.on('end', () => {
      clearTimeout(timer);
      if (settled) return;
      settled = true;
      resolve({
        title: 'Stdin Input',
        content: data.trim(),
        sourceType: 'text',
        metadata: {
          source: 'stdin',
          fetchedAt: Date.now(),
        },
      });
    });
    process.stdin.on('error', (err) => {
      clearTimeout(timer);
      if (!settled) {
        settled = true;
        reject(err);
      }
    });
  });
}

162
src/core/ingest/index.ts Normal file
View File

@@ -0,0 +1,162 @@
import * as crypto from 'crypto';
import { addNode, addEdge, listNodes, query } from '../store';
import { Node } from '../../types';
import { detectSourceType, fetchContent, fetchFromStdin, FetchedContent, SourceType } from './fetchers';
import { chunkContent, ChunkOptions, Chunk } from './chunker';
/** Options controlling how `ingest` fetches, chunks, and links content. */
export interface IngestOptions {
  // Override the title derived from the fetched content.
  title?: string;
  // Extra tags applied to every created node.
  tags?: string[];
  // Chunking overrides (max tokens, overlap, split strategy).
  chunkStrategy?: ChunkOptions;
  // Skip auto-linking new nodes to semantically related ones.
  noLink?: boolean;
  // Read content from stdin instead of `source`.
  stdin?: boolean;
}
/** Summary of a completed (or deduplicated) ingestion. */
export interface IngestResult {
  // False when the content was a duplicate (same checksum) and nothing was stored.
  success: boolean;
  sourceType: SourceType;
  title: string;
  // Number of nodes created: 1 for small content, parent + chunks otherwise.
  nodeCount: number;
  nodes: { id: string; title: string }[];
  // Set only when content was chunked: id of the parent summary node.
  parentId?: string;
}
/**
 * Ingest content from a URL, file path, or stdin into the knowledge graph.
 *
 * Pipeline: fetch → dedupe by MD5 content checksum → chunk → store node(s)
 * → optionally auto-link each new node to semantically related ones.
 *
 * @param source  URL or file path; ignored when `options.stdin` is set.
 * @param options Title/tag overrides, chunking strategy, stdin flag, and
 *                `noLink` to skip auto-linking.
 * @returns `success: false` with an empty node list when content with the
 *          same checksum was already ingested.
 */
export async function ingest(source: string, options: IngestOptions = {}): Promise<IngestResult> {
  // Fetch content (stdin takes precedence over the positional source)
  let fetched: FetchedContent;
  if (options.stdin) {
    fetched = await fetchFromStdin();
  } else {
    const sourceType = detectSourceType(source);
    fetched = await fetchContent(source, sourceType);
  }
  const title = options.title || fetched.title;
  const tags = options.tags || [];
  // Compute checksum for deduplication (content identity only, not security)
  const checksum = crypto.createHash('md5').update(fetched.content).digest('hex');
  // Check for duplicates among previously ingested memory nodes
  const existingDupe = listNodes({ kind: 'memory', tags: ['ingested'] })
    .find(n => (n.metadata as any)?.source?.checksum === checksum);
  if (existingDupe) {
    return {
      success: false,
      sourceType: fetched.sourceType,
      title,
      nodeCount: 0,
      nodes: [],
    };
  }
  // Chunk content if needed (small content yields a single chunk)
  const chunks = chunkContent(fetched.content, options.chunkStrategy);
  const nodes: Node[] = [];
  if (chunks.length === 1) {
    // Single node: everything fits in one chunk
    const node = await addNode({
      kind: 'memory',
      title,
      content: chunks[0].content,
      tags: ['ingested', fetched.sourceType, ...tags],
      metadata: {
        source: {
          type: fetched.sourceType,
          ...fetched.metadata,
          checksum,
        },
        tokenEstimate: chunks[0].tokenEstimate,
      },
    });
    nodes.push(node);
  } else {
    // Create parent node with summary: source info, chunk count, and a
    // 500-char preview of the raw content
    const summaryContent = `Ingested content from ${fetched.sourceType} source.\n\n` +
      `**Source:** ${fetched.metadata.url || fetched.metadata.filePath || 'stdin'}\n` +
      `**Chunks:** ${chunks.length}\n` +
      `**Total tokens:** ~${chunks.reduce((sum, c) => sum + c.tokenEstimate, 0)}\n\n` +
      `## Preview\n\n${fetched.content.slice(0, 500)}...`;
    const parentNode = await addNode({
      kind: 'memory',
      title,
      content: summaryContent,
      tags: ['ingested', fetched.sourceType, 'parent', ...tags],
      metadata: {
        source: {
          type: fetched.sourceType,
          ...fetched.metadata,
          // Only the parent carries the checksum used for deduplication
          checksum,
        },
        chunkCount: chunks.length,
      },
    });
    nodes.push(parentNode);
    // Create child nodes for each chunk, attached to the parent via 'contains'
    for (const chunk of chunks) {
      const chunkNode = await addNode({
        kind: 'memory',
        title: `${title} (Part ${chunk.index + 1}/${chunks.length})`,
        content: chunk.content,
        tags: ['ingested', fetched.sourceType, 'chunk', ...tags],
        metadata: {
          parentId: parentNode.id,
          chunkIndex: chunk.index,
          tokenEstimate: chunk.tokenEstimate,
        },
      });
      nodes.push(chunkNode);
      addEdge(parentNode.id, chunkNode.id, 'contains');
    }
  }
  // Auto-link to related nodes (if not disabled)
  if (!options.noLink) {
    for (const node of nodes) {
      await linkRelatedNodes(node);
    }
  }
  return {
    success: true,
    sourceType: fetched.sourceType,
    title,
    nodeCount: nodes.length,
    nodes: nodes.map(n => ({ id: n.id, title: n.title })),
    // When chunked, nodes[0] is the parent summary node
    parentId: chunks.length > 1 ? nodes[0].id : undefined,
  };
}
/**
 * Find semantically related existing nodes and add `relates_to` edges.
 * Best-effort: search or edge failures are swallowed so ingestion never
 * fails because of linking.
 */
async function linkRelatedNodes(node: Node): Promise<void> {
  // Use the title plus a content prefix as the similarity query.
  const searchText = node.title + ' ' + node.content.slice(0, 500);
  try {
    const related = await query(searchText, { limit: 5 });
    const ownChecksum = (node.metadata as any)?.source?.checksum;
    const ownParent = (node.metadata as any)?.parentId;
    for (const result of related) {
      // Don't link to self.
      if (result.node.id === node.id) continue;
      // Don't link siblings from the same ingestion. The original compared
      // checksums unconditionally, so for chunk nodes (which carry no
      // source.checksum) `undefined === undefined` skipped linking to every
      // node without a checksum — i.e. most of the graph.
      const otherChecksum = (result.node.metadata as any)?.source?.checksum;
      if (ownChecksum !== undefined && ownChecksum === otherChecksum) continue;
      // Chunk nodes: skip the parent (already linked via 'contains') and
      // sibling chunks from the same parent.
      if (ownParent !== undefined) {
        if (result.node.id === ownParent) continue;
        if ((result.node.metadata as any)?.parentId === ownParent) continue;
      }
      // Only link if relevance is high enough.
      if (result.score > 0.5) {
        try {
          addEdge(node.id, result.node.id, 'relates_to', { reason: 'semantic-similarity', score: result.score });
        } catch {
          // Edge might already exist
        }
      }
    }
  } catch {
    // Search might fail, that's okay — linking is optional
  }
}
export { detectSourceType, fetchContent, fetchFromStdin, SourceType } from './fetchers';
export { chunkContent, estimateTokens, ChunkOptions, Chunk } from './chunker';

View File

@@ -650,6 +650,56 @@ server.tool(
}
);
// --- memory_ingest ---
import { ingest } from '../core/ingest';
server.tool(
  'memory_ingest',
  'Ingest content from a URL or text into the knowledge graph',
  {
    source: z.string().describe('URL or raw text to ingest'),
    title: z.string().optional().describe('Override title'),
    tags: z.array(z.string()).optional().describe('Tags to apply'),
    isUrl: z.boolean().optional().describe('Treat source as URL (auto-detected if not specified)'),
  },
  async ({ source, title, tags, isUrl }) => {
    // Auto-detect URLs unless the caller explicitly overrides.
    const looksLikeUrl = source.startsWith('http://') || source.startsWith('https://');
    const treatAsUrl = isUrl ?? looksLikeUrl;
    if (treatAsUrl) {
      // Full ingestion pipeline: fetch, dedupe, chunk, link.
      const result = await ingest(source, { title, tags });
      return { content: [{ type: 'text' as const, text: serialize(result) }] };
    }
    // Raw text: store directly as a single memory node.
    const node = await addNode({
      kind: 'memory',
      title: title || 'Ingested Content',
      content: source,
      tags: ['ingested', 'text', ...(tags || [])],
      metadata: { source: { type: 'text', ingestedAt: Date.now() } },
    });
    return {
      content: [
        { type: 'text' as const, text: serialize({ success: true, nodeId: node.id, title: node.title }) },
      ],
    };
  }
);
server.tool(
  'memory_clip',
  'Quick clip a URL into memory',
  {
    url: z.string().describe('URL to clip'),
    title: z.string().optional().describe('Override title'),
    tags: z.array(z.string()).optional().describe('Tags to apply'),
  },
  async ({ url, title, tags }) => {
    // Reject anything that is not an http(s) URL before fetching.
    const isHttpUrl = url.startsWith('http://') || url.startsWith('https://');
    if (!isHttpUrl) {
      return {
        content: [{ type: 'text' as const, text: serialize({ error: 'Invalid URL' }) }],
        isError: true,
      };
    }
    const result = await ingest(url, { title, tags });
    return { content: [{ type: 'text' as const, text: serialize(result) }] };
  }
);
// --- memory_index ---
import { indexProject } from '../core/indexer';