Add URL and content ingestion (Milestone 6)
- Add URL fetching with HTML-to-text extraction
- Add basic PDF text extraction
- Add smart content chunking with overlap
- Add deduplication via content checksums
- Add auto-linking to semantically related nodes
- Add CLI commands: ingest, clip
- Add MCP tools: memory_ingest, memory_clip
src/cli/commands/ingest.ts (new file, 95 lines)
import { Command } from 'commander';
import chalk from 'chalk';
import { ingest } from '../../core/ingest';

export const ingestCommand = new Command('ingest')
  .description('Ingest content from URLs, files, or stdin into the knowledge graph')
  .argument('[source]', 'URL or file path to ingest')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .option('--stdin', 'Read content from stdin')
  .option('--chunk-size <n>', 'Max tokens per chunk (default: 1000)')
  .option('--no-link', 'Skip auto-linking to related nodes')
  .action(async (source: string | undefined, opts) => {
    try {
      if (!source && !opts.stdin) {
        console.error(chalk.red('Error: Provide a source URL/file or use --stdin'));
        process.exit(1);
      }

      if (opts.stdin) {
        console.log(chalk.cyan('Reading from stdin... (Ctrl+D to end)'));
      } else {
        console.log(chalk.cyan(`Ingesting: ${source}`));
      }

      const result = await ingest(source || '', {
        title: opts.title,
        tags: opts.tags?.split(',').map((t: string) => t.trim()),
        stdin: opts.stdin,
        noLink: !opts.link,
        chunkStrategy: opts.chunkSize ? {
          maxTokens: parseInt(opts.chunkSize),
        } : undefined,
      });

      if (!result.success) {
        console.log(chalk.yellow('Content already exists (duplicate checksum)'));
        return;
      }

      console.log();
      console.log(chalk.green(`✓ Ingested: ${result.title}`));
      console.log();
      console.log(`  Type: ${result.sourceType}`);
      console.log(`  Nodes: ${result.nodeCount}`);

      if (result.parentId) {
        console.log(`  Parent: ${result.parentId.slice(0, 8)}`);
      }

      for (const node of result.nodes.slice(0, 5)) {
        console.log(chalk.dim(`  - ${node.id.slice(0, 8)} ${node.title}`));
      }

      if (result.nodes.length > 5) {
        console.log(chalk.dim(`  ... and ${result.nodes.length - 5} more`));
      }
    } catch (err: any) {
      console.error(chalk.red(`Error: ${err.message}`));
      process.exit(1);
    }
  });

// Alias for quick URL clipping
export const clipCommand = new Command('clip')
  .description('Quick clip a URL (alias for ingest)')
  .argument('<url>', 'URL to clip')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .action(async (url: string, opts) => {
    try {
      if (!url.startsWith('http://') && !url.startsWith('https://')) {
        console.error(chalk.red('Error: clip expects a URL'));
        process.exit(1);
      }

      console.log(chalk.cyan(`Clipping: ${url}`));

      const result = await ingest(url, {
        title: opts.title,
        tags: opts.tags?.split(',').map((t: string) => t.trim()),
      });

      if (!result.success) {
        console.log(chalk.yellow('Already clipped (duplicate)'));
        return;
      }

      console.log(chalk.green(`✓ ${result.title}`));
      console.log(chalk.dim(`  ${result.nodes[0].id.slice(0, 8)}`));
    } catch (err: any) {
      console.error(chalk.red(`Error: ${err.message}`));
      process.exit(1);
    }
  });
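One subtlety in the action handler above: Commander treats an option declared as `--no-link` as a negatable boolean, exposed as `opts.link` (defaulting to `true`, set to `false` when the flag is passed) rather than `opts.noLink` — which is why the handler reads `noLink: !opts.link`. A minimal sketch of that behavior (the `demo` command is hypothetical, not part of this commit):

import { Command } from 'commander';

const demo = new Command('demo')
  .option('--no-link', 'Skip auto-linking')
  .action((opts) => {
    // Without the flag: opts.link === true; with --no-link: opts.link === false
    console.log('link enabled:', opts.link);
  });

demo.parse(['--no-link'], { from: 'user' });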
CLI entry module (modified):

@@ -18,6 +18,7 @@ import { captureCommand, captureHookCommand, configCommand } from './commands/ca
 import { contextCommand, contextHookCommand } from './commands/context';
 import { indexCommand } from './commands/index-cmd';
 import { journalCommand, journalAliasCommand, quickCaptureCommand } from './commands/journal';
+import { ingestCommand, clipCommand } from './commands/ingest';
 import { closeDb } from '../core/db';

 const program = new Command();
@@ -50,6 +51,8 @@ program.addCommand(indexCommand);
 program.addCommand(journalCommand);
 program.addCommand(journalAliasCommand);
 program.addCommand(quickCaptureCommand);
+program.addCommand(ingestCommand);
+program.addCommand(clipCommand);

 program.hook('postAction', () => {
   closeDb();
src/core/ingest/chunker.ts (new file, 93 lines)
export interface ChunkOptions {
  maxTokens?: number;
  overlap?: number;
  splitOn?: 'paragraph' | 'sentence' | 'heading';
}

export interface Chunk {
  index: number;
  content: string;
  tokenEstimate: number;
}

const DEFAULT_MAX_TOKENS = 1000;
const DEFAULT_OVERLAP = 100;
const CHARS_PER_TOKEN = 4; // Rough estimate

export function chunkContent(content: string, options: ChunkOptions = {}): Chunk[] {
  const maxTokens = options.maxTokens || DEFAULT_MAX_TOKENS;
  const overlap = options.overlap || DEFAULT_OVERLAP;
  const splitOn = options.splitOn || 'paragraph';

  const maxChars = maxTokens * CHARS_PER_TOKEN;
  const overlapChars = overlap * CHARS_PER_TOKEN;

  // If content is small enough, return as a single chunk
  if (content.length <= maxChars) {
    return [{
      index: 0,
      content,
      tokenEstimate: Math.ceil(content.length / CHARS_PER_TOKEN),
    }];
  }

  // Split into segments based on strategy
  const segments = splitContent(content, splitOn);

  // Combine segments into chunks
  const chunks: Chunk[] = [];
  let currentChunk = '';
  let chunkIndex = 0;

  for (const segment of segments) {
    if (currentChunk.length + segment.length > maxChars && currentChunk.length > 0) {
      // Save the current chunk
      chunks.push({
        index: chunkIndex++,
        content: currentChunk.trim(),
        tokenEstimate: Math.ceil(currentChunk.length / CHARS_PER_TOKEN),
      });

      // Start the new chunk with overlap from the end of the previous one
      if (overlapChars > 0 && currentChunk.length > overlapChars) {
        currentChunk = currentChunk.slice(-overlapChars) + segment;
      } else {
        currentChunk = segment;
      }
    } else {
      currentChunk += segment;
    }
  }

  // Don't forget the last chunk
  if (currentChunk.trim()) {
    chunks.push({
      index: chunkIndex,
      content: currentChunk.trim(),
      tokenEstimate: Math.ceil(currentChunk.length / CHARS_PER_TOKEN),
    });
  }

  return chunks;
}

function splitContent(content: string, strategy: 'paragraph' | 'sentence' | 'heading'): string[] {
  switch (strategy) {
    case 'heading':
      // Split on markdown headings
      return content.split(/(?=^#{1,6}\s)/m).filter(s => s.trim());

    case 'sentence':
      // Split on sentence boundaries
      return content.split(/(?<=[.!?])\s+/).filter(s => s.trim());

    case 'paragraph':
    default:
      // Split on double newlines (paragraphs)
      return content.split(/\n\n+/).filter(s => s.trim()).map(s => s + '\n\n');
  }
}

export function estimateTokens(text: string): number {
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
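To see the output shape, here is a small hedged usage sketch; the sample text and tiny limits are invented for illustration, and the token figures are only the 4-chars-per-token estimate used above:

import { chunkContent, estimateTokens } from './chunker';

// Illustrative only: two short paragraphs forced into tiny chunks.
const sample = 'First paragraph about graphs.\n\nSecond paragraph about chunking.';

// maxTokens: 10 -> maxChars: 40; overlap: 2 -> overlapChars: 8
const chunks = chunkContent(sample, { maxTokens: 10, overlap: 2 });

for (const c of chunks) {
  console.log(c.index, c.tokenEstimate, JSON.stringify(c.content));
}
// Each chunk after the first begins with up to 8 trailing characters
// of its predecessor, giving retrieval a little shared context.

console.log(estimateTokens(sample)); // ceil(length / 4)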
src/core/ingest/fetchers.ts (new file, 252 lines)
import * as fs from 'fs';
import * as path from 'path';
import * as https from 'https';
import * as http from 'http';

export type SourceType = 'url' | 'pdf' | 'markdown' | 'text' | 'html';

export interface FetchedContent {
  title: string;
  content: string;
  sourceType: SourceType;
  metadata: Record<string, any>;
}

export function detectSourceType(source: string): SourceType {
  if (source.startsWith('http://') || source.startsWith('https://')) {
    return 'url';
  }
  const ext = path.extname(source).toLowerCase();
  switch (ext) {
    case '.pdf': return 'pdf';
    case '.md': return 'markdown';
    case '.html': case '.htm': return 'html';
    default: return 'text';
  }
}

export async function fetchContent(source: string, sourceType: SourceType): Promise<FetchedContent> {
  switch (sourceType) {
    case 'url':
      return fetchUrl(source);
    case 'pdf':
      return fetchPdf(source);
    case 'markdown':
    case 'text':
    case 'html':
      return fetchFile(source, sourceType);
    default:
      throw new Error(`Unsupported source type: ${sourceType}`);
  }
}

async function fetchUrl(url: string): Promise<FetchedContent> {
  const html = await httpGet(url);

  // Extract title from HTML
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  const title = titleMatch?.[1]?.trim() || new URL(url).hostname;

  // Convert HTML to readable text
  const content = htmlToText(html);

  return {
    title,
    content,
    sourceType: 'url',
    metadata: {
      url,
      fetchedAt: Date.now(),
    },
  };
}

function httpGet(url: string): Promise<string> {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https') ? https : http;
    const req = client.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; Cortex/1.0)',
        'Accept': 'text/html,application/xhtml+xml,text/plain,*/*',
      },
      timeout: 30000,
    }, (res) => {
      // Handle redirects
      if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        const redirectUrl = new URL(res.headers.location, url).toString();
        resolve(httpGet(redirectUrl));
        return;
      }

      if (res.statusCode && res.statusCode >= 400) {
        reject(new Error(`HTTP ${res.statusCode}`));
        return;
      }

      let data = '';
      res.on('data', chunk => data += chunk);
      res.on('end', () => resolve(data));
    });
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
  });
}

function htmlToText(html: string): string {
  // Remove scripts, styles, and other non-content elements
  let text = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
    .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
    .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
    .replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '');

  // Try to find the main content
  const mainMatch = text.match(/<main[^>]*>([\s\S]*?)<\/main>/i) ||
    text.match(/<article[^>]*>([\s\S]*?)<\/article>/i) ||
    text.match(/<div[^>]*class="[^"]*content[^"]*"[^>]*>([\s\S]*?)<\/div>/i);

  if (mainMatch) {
    text = mainMatch[1];
  } else {
    // Fall back to the body
    const bodyMatch = text.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    if (bodyMatch) text = bodyMatch[1];
  }

  // Convert common elements to a markdown-ish format
  text = text
    .replace(/<h1[^>]*>/gi, '\n# ')
    .replace(/<\/h1>/gi, '\n')
    .replace(/<h2[^>]*>/gi, '\n## ')
    .replace(/<\/h2>/gi, '\n')
    .replace(/<h3[^>]*>/gi, '\n### ')
    .replace(/<\/h3>/gi, '\n')
    .replace(/<h[456][^>]*>/gi, '\n#### ')
    .replace(/<\/h[456]>/gi, '\n')
    .replace(/<p[^>]*>/gi, '\n')
    .replace(/<\/p>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<li[^>]*>/gi, '\n- ')
    .replace(/<\/li>/gi, '')
    .replace(/<[^>]+>/g, ' ') // Remove remaining tags
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/\n{3,}/g, '\n\n') // Collapse multiple newlines
    .replace(/[ \t]+/g, ' ') // Collapse spaces
    .trim();

  return text;
}

async function fetchPdf(filePath: string): Promise<FetchedContent> {
  // Basic PDF text extraction (very simple - just looks for text streams)
  // For production, you'd want pdf-parse or pdfjs-dist
  const buffer = fs.readFileSync(filePath);
  const content = extractPdfText(buffer);

  return {
    title: path.basename(filePath, '.pdf'),
    content: content || '[PDF content could not be extracted. Install pdf-parse for better support.]',
    sourceType: 'pdf',
    metadata: {
      filePath,
      size: buffer.length,
      fetchedAt: Date.now(),
    },
  };
}

function extractPdfText(buffer: Buffer): string {
  // Very basic PDF text extraction
  // Looks for text between BT and ET markers, handles some encoding
  const str = buffer.toString('binary');
  const texts: string[] = [];

  // Find text objects
  const textRegex = /BT[\s\S]*?ET/g;
  let match;
  while ((match = textRegex.exec(str)) !== null) {
    const block = match[0];
    // Extract text from Tj and TJ operators
    const tjMatches = block.match(/\(([^)]*)\)\s*Tj/g) || [];
    for (const tj of tjMatches) {
      const textMatch = tj.match(/\(([^)]*)\)/);
      if (textMatch) {
        texts.push(textMatch[1]);
      }
    }
  }

  // Also try to find raw text streams
  const streamRegex = /stream\s*([\s\S]*?)\s*endstream/g;
  while ((match = streamRegex.exec(str)) !== null) {
    const decoded = match[1].replace(/[^\x20-\x7E\n\r\t]/g, ' ').trim();
    if (decoded.length > 50 && /[a-zA-Z]{3,}/.test(decoded)) {
      texts.push(decoded);
    }
  }

  return texts.join('\n').replace(/\s+/g, ' ').trim();
}

async function fetchFile(filePath: string, sourceType: SourceType): Promise<FetchedContent> {
  const content = fs.readFileSync(filePath, 'utf-8');
  const title = path.basename(filePath, path.extname(filePath));

  let processedContent = content;
  if (sourceType === 'html') {
    processedContent = htmlToText(content);
  }

  return {
    title,
    content: processedContent,
    sourceType,
    metadata: {
      filePath,
      fetchedAt: Date.now(),
    },
  };
}

export async function fetchFromClipboard(): Promise<FetchedContent> {
  // This is platform-specific and won't work in all environments
  // For standalone builds, we'd need native bindings
  throw new Error('Clipboard support requires platform-specific implementation');
}

export async function fetchFromStdin(): Promise<FetchedContent> {
  return new Promise((resolve, reject) => {
    let data = '';
    process.stdin.setEncoding('utf8');
    process.stdin.on('data', chunk => data += chunk);
    process.stdin.on('end', () => {
      resolve({
        title: 'Stdin Input',
        content: data.trim(),
        sourceType: 'text',
        metadata: {
          source: 'stdin',
          fetchedAt: Date.now(),
        },
      });
    });
    process.stdin.on('error', reject);

    // Timeout for stdin
    setTimeout(() => {
      if (!data) {
        reject(new Error('No input received from stdin'));
      }
    }, 5000);
  });
}
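The extractor above is deliberately crude, and its own comment names pdf-parse as the production path. A hedged sketch of what that swap might look like — assuming pdf-parse is installed as a dependency (its default export takes a Buffer and resolves to an object with a text field); this helper is hypothetical and not part of the commit:

// Hypothetical replacement for extractPdfText using pdf-parse.
// Assumes `npm install pdf-parse`; not part of this commit.
import * as fs from 'fs';
import pdfParse from 'pdf-parse';

async function extractPdfTextWithLibrary(filePath: string): Promise<string> {
  const buffer = fs.readFileSync(filePath);
  const data = await pdfParse(buffer);
  // data.text holds the extracted text; data.numpages and data.info are also available
  return data.text.trim();
}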
src/core/ingest/index.ts (new file, 162 lines)
import * as crypto from 'crypto';
import { addNode, addEdge, listNodes, query } from '../store';
import { Node } from '../../types';
import { detectSourceType, fetchContent, fetchFromStdin, FetchedContent, SourceType } from './fetchers';
import { chunkContent, ChunkOptions, Chunk } from './chunker';

export interface IngestOptions {
  title?: string;
  tags?: string[];
  chunkStrategy?: ChunkOptions;
  noLink?: boolean;
  stdin?: boolean;
}

export interface IngestResult {
  success: boolean;
  sourceType: SourceType;
  title: string;
  nodeCount: number;
  nodes: { id: string; title: string }[];
  parentId?: string;
}

export async function ingest(source: string, options: IngestOptions = {}): Promise<IngestResult> {
  // Fetch content
  let fetched: FetchedContent;

  if (options.stdin) {
    fetched = await fetchFromStdin();
  } else {
    const sourceType = detectSourceType(source);
    fetched = await fetchContent(source, sourceType);
  }

  const title = options.title || fetched.title;
  const tags = options.tags || [];

  // Compute checksum for deduplication
  const checksum = crypto.createHash('md5').update(fetched.content).digest('hex');

  // Check for duplicates
  const existingDupe = listNodes({ kind: 'memory', tags: ['ingested'] })
    .find(n => (n.metadata as any)?.source?.checksum === checksum);

  if (existingDupe) {
    return {
      success: false,
      sourceType: fetched.sourceType,
      title,
      nodeCount: 0,
      nodes: [],
    };
  }

  // Chunk content if needed
  const chunks = chunkContent(fetched.content, options.chunkStrategy);
  const nodes: Node[] = [];

  if (chunks.length === 1) {
    // Single node
    const node = await addNode({
      kind: 'memory',
      title,
      content: chunks[0].content,
      tags: ['ingested', fetched.sourceType, ...tags],
      metadata: {
        source: {
          type: fetched.sourceType,
          ...fetched.metadata,
          checksum,
        },
        tokenEstimate: chunks[0].tokenEstimate,
      },
    });
    nodes.push(node);
  } else {
    // Create a parent node with a summary
    const summaryContent = `Ingested content from ${fetched.sourceType} source.\n\n` +
      `**Source:** ${fetched.metadata.url || fetched.metadata.filePath || 'stdin'}\n` +
      `**Chunks:** ${chunks.length}\n` +
      `**Total tokens:** ~${chunks.reduce((sum, c) => sum + c.tokenEstimate, 0)}\n\n` +
      `## Preview\n\n${fetched.content.slice(0, 500)}...`;

    const parentNode = await addNode({
      kind: 'memory',
      title,
      content: summaryContent,
      tags: ['ingested', fetched.sourceType, 'parent', ...tags],
      metadata: {
        source: {
          type: fetched.sourceType,
          ...fetched.metadata,
          checksum,
        },
        chunkCount: chunks.length,
      },
    });
    nodes.push(parentNode);

    // Create child nodes for each chunk
    for (const chunk of chunks) {
      const chunkNode = await addNode({
        kind: 'memory',
        title: `${title} (Part ${chunk.index + 1}/${chunks.length})`,
        content: chunk.content,
        tags: ['ingested', fetched.sourceType, 'chunk', ...tags],
        metadata: {
          parentId: parentNode.id,
          chunkIndex: chunk.index,
          tokenEstimate: chunk.tokenEstimate,
        },
      });
      nodes.push(chunkNode);
      addEdge(parentNode.id, chunkNode.id, 'contains');
    }
  }

  // Auto-link to related nodes (if not disabled)
  if (!options.noLink) {
    for (const node of nodes) {
      await linkRelatedNodes(node);
    }
  }

  return {
    success: true,
    sourceType: fetched.sourceType,
    title,
    nodeCount: nodes.length,
    nodes: nodes.map(n => ({ id: n.id, title: n.title })),
    parentId: chunks.length > 1 ? nodes[0].id : undefined,
  };
}

async function linkRelatedNodes(node: Node): Promise<void> {
  // Search for related nodes based on content
  const searchText = node.title + ' ' + node.content.slice(0, 500);

  try {
    const related = await query(searchText, { limit: 5 });

    for (const result of related) {
      // Don't link to self or siblings from the same ingestion
      if (result.node.id === node.id) continue;
      if ((result.node.metadata as any)?.source?.checksum === (node.metadata as any)?.source?.checksum) continue;

      // Only link if relevance is high enough
      if (result.score > 0.5) {
        try {
          addEdge(node.id, result.node.id, 'relates_to', { reason: 'semantic-similarity', score: result.score });
        } catch {
          // Edge might already exist
        }
      }
    }
  } catch {
    // Search might fail; that's okay
  }
}

export { detectSourceType, fetchContent, fetchFromStdin, SourceType } from './fetchers';
export { chunkContent, estimateTokens, ChunkOptions, Chunk } from './chunker';
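Putting the module together, a hedged usage sketch of the exported API; the URL, tags, and chunk limits are illustrative, and the result fields mirror IngestResult above:

import { ingest } from '../core/ingest'; // same import path the MCP server uses

async function demo(): Promise<void> {
  // Illustrative URL and options; a long page yields a parent node plus chunk nodes.
  const result = await ingest('https://example.com/article', {
    tags: ['reading'],
    chunkStrategy: { maxTokens: 800, overlap: 80 },
  });

  if (result.success) {
    console.log(result.nodeCount, 'node(s) created');
    if (result.parentId) console.log('parent:', result.parentId);
  }

  // Re-ingesting identical content trips the MD5 checksum check:
  const again = await ingest('https://example.com/article', { tags: ['reading'] });
  console.log(again.success); // false (duplicate)
}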
MCP server module (modified):

@@ -650,6 +650,56 @@ server.tool(
   }
 );

+// --- memory_ingest ---
+import { ingest } from '../core/ingest';
+
+server.tool(
+  'memory_ingest',
+  'Ingest content from a URL or text into the knowledge graph',
+  {
+    source: z.string().describe('URL or raw text to ingest'),
+    title: z.string().optional().describe('Override title'),
+    tags: z.array(z.string()).optional().describe('Tags to apply'),
+    isUrl: z.boolean().optional().describe('Treat source as URL (auto-detected if not specified)'),
+  },
+  async ({ source, title, tags, isUrl }) => {
+    // If explicitly not a URL, or it doesn't look like one, treat the source as raw text
+    const isSourceUrl = isUrl ?? (source.startsWith('http://') || source.startsWith('https://'));
+
+    if (!isSourceUrl) {
+      // Treat as raw text - create a simple memory node
+      const node = await addNode({
+        kind: 'memory',
+        title: title || 'Ingested Content',
+        content: source,
+        tags: ['ingested', 'text', ...(tags || [])],
+        metadata: { source: { type: 'text', ingestedAt: Date.now() } },
+      });
+      return { content: [{ type: 'text' as const, text: serialize({ success: true, nodeId: node.id, title: node.title }) }] };
+    }
+
+    const result = await ingest(source, { title, tags });
+    return { content: [{ type: 'text' as const, text: serialize(result) }] };
+  }
+);
+
+server.tool(
+  'memory_clip',
+  'Quick clip a URL into memory',
+  {
+    url: z.string().describe('URL to clip'),
+    title: z.string().optional().describe('Override title'),
+    tags: z.array(z.string()).optional().describe('Tags to apply'),
+  },
+  async ({ url, title, tags }) => {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      return { content: [{ type: 'text' as const, text: serialize({ error: 'Invalid URL' }) }], isError: true };
+    }
+    const result = await ingest(url, { title, tags });
+    return { content: [{ type: 'text' as const, text: serialize(result) }] };
+  }
+);
+
 // --- memory_index ---
 import { indexProject } from '../core/indexer';
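For clients, both tools are reached through the standard MCP tools/call request. A hedged sketch of the wire-level payload a client might send for memory_clip — the field names follow the MCP JSON-RPC protocol, and the URL and tags are illustrative:

// Shape of an MCP tools/call request invoking the new memory_clip tool.
const request = {
  jsonrpc: '2.0' as const,
  id: 1,
  method: 'tools/call',
  params: {
    name: 'memory_clip',
    arguments: {
      url: 'https://example.com/article',
      tags: ['clipped'],
    },
  },
};
// Per the handler above, the server replies with
// { content: [{ type: 'text', text: <serialized IngestResult> }] }.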