diff --git a/src/cli/commands/ingest.ts b/src/cli/commands/ingest.ts
new file mode 100644
index 0000000..3a7b8b5
--- /dev/null
+++ b/src/cli/commands/ingest.ts
@@ -0,0 +1,95 @@
+import { Command } from 'commander';
+import chalk from 'chalk';
+import { ingest } from '../../core/ingest';
+
+export const ingestCommand = new Command('ingest')
+  .description('Ingest content from URLs, files, or stdin into the knowledge graph')
+  .argument('[source]', 'URL or file path to ingest')
+  .option('-t, --title <title>', 'Override title')
+  .option('--tags <tags>', 'Tags to apply (comma-separated)')
+  .option('--stdin', 'Read content from stdin')
+  .option('--chunk-size <n>', 'Max tokens per chunk (default: 1000)')
+  .option('--no-link', 'Skip auto-linking to related nodes')
+  .action(async (source: string | undefined, opts) => {
+    try {
+      if (!source && !opts.stdin) {
+        console.error(chalk.red('Error: Provide a source URL/file or use --stdin'));
+        process.exit(1);
+      }
+
+      if (opts.stdin) {
+        console.log(chalk.cyan('Reading from stdin... (Ctrl+D to end)'));
+      } else {
+        console.log(chalk.cyan(`Ingesting: ${source}`));
+      }
+
+      const result = await ingest(source || '', {
+        title: opts.title,
+        tags: opts.tags?.split(',').map((t: string) => t.trim()),
+        stdin: opts.stdin,
+        noLink: !opts.link,
+        chunkStrategy: opts.chunkSize ? {
+          maxTokens: parseInt(opts.chunkSize, 10),
+        } : undefined,
+      });
+
+      if (!result.success) {
+        console.log(chalk.yellow('Content already exists (duplicate checksum)'));
+        return;
+      }
+
+      console.log();
+      console.log(chalk.green(`✓ Ingested: ${result.title}`));
+      console.log();
+      console.log(`  Type:  ${result.sourceType}`);
+      console.log(`  Nodes: ${result.nodeCount}`);
+
+      if (result.parentId) {
+        console.log(`  Parent: ${result.parentId.slice(0, 8)}`);
+      }
+
+      for (const node of result.nodes.slice(0, 5)) {
+        console.log(chalk.dim(`  - ${node.id.slice(0, 8)} ${node.title}`));
+      }
+
+      if (result.nodes.length > 5) {
+        console.log(chalk.dim(`  ... and ${result.nodes.length - 5} more`));
+      }
+    } catch (err: any) {
+      console.error(chalk.red(`Error: ${err.message}`));
+      process.exit(1);
+    }
+  });
+
+// Alias for quick URL clipping
+export const clipCommand = new Command('clip')
+  .description('Quick clip a URL (alias for ingest)')
+  .argument('<url>', 'URL to clip')
+  .option('-t, --title <title>', 'Override title')
+  .option('--tags <tags>', 'Tags to apply (comma-separated)')
+  .action(async (url: string, opts) => {
+    try {
+      if (!url.startsWith('http://') && !url.startsWith('https://')) {
+        console.error(chalk.red('Error: clip expects a URL'));
+        process.exit(1);
+      }
+
+      console.log(chalk.cyan(`Clipping: ${url}`));
+
+      const result = await ingest(url, {
+        title: opts.title,
+        tags: opts.tags?.split(',').map((t: string) => t.trim()),
+      });
+
+      if (!result.success) {
+        console.log(chalk.yellow('Already clipped (duplicate)'));
+        return;
+      }
+
+      console.log(chalk.green(`✓ ${result.title}`));
+      console.log(chalk.dim(`  ${result.nodes[0].id.slice(0, 8)}`));
+    } catch (err: any) {
+      console.error(chalk.red(`Error: ${err.message}`));
+      process.exit(1);
+    }
+  });
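Both commands are thin wrappers over the core `ingest()` entry point, so the same pipeline can be driven from code. A minimal sketch of a hypothetical caller — option names come from the `IngestOptions` interface later in this diff, and the URL is purely illustrative:

```typescript
import { ingest } from '../../core/ingest';

// Equivalent to: ingest https://example.com/post --tags web,reading --chunk-size 500
async function clipExample() {
  const result = await ingest('https://example.com/post', {
    tags: ['web', 'reading'],
    chunkStrategy: { maxTokens: 500 },
  });
  if (result.success) {
    console.log(`Created ${result.nodeCount} node(s) for "${result.title}"`);
  }
}
```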
diff --git a/src/cli/index.ts b/src/cli/index.ts
index a802482..b1c46dd 100644
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@@ -18,6 +18,7 @@ import { captureCommand, captureHookCommand, configCommand } from './commands/ca
 import { contextCommand, contextHookCommand } from './commands/context';
 import { indexCommand } from './commands/index-cmd';
 import { journalCommand, journalAliasCommand, quickCaptureCommand } from './commands/journal';
+import { ingestCommand, clipCommand } from './commands/ingest';
 import { closeDb } from '../core/db';
 
 const program = new Command();
@@ -50,6 +51,8 @@ program.addCommand(indexCommand);
 program.addCommand(journalCommand);
 program.addCommand(journalAliasCommand);
 program.addCommand(quickCaptureCommand);
+program.addCommand(ingestCommand);
+program.addCommand(clipCommand);
 
 program.hook('postAction', () => {
   closeDb();
diff --git a/src/core/ingest/chunker.ts b/src/core/ingest/chunker.ts
new file mode 100644
index 0000000..a211753
--- /dev/null
+++ b/src/core/ingest/chunker.ts
@@ -0,0 +1,93 @@
+export interface ChunkOptions {
+  maxTokens?: number;
+  overlap?: number;
+  splitOn?: 'paragraph' | 'sentence' | 'heading';
+}
+
+export interface Chunk {
+  index: number;
+  content: string;
+  tokenEstimate: number;
+}
+
+const DEFAULT_MAX_TOKENS = 1000;
+const DEFAULT_OVERLAP = 100;
+const CHARS_PER_TOKEN = 4; // Rough estimate
+
+export function chunkContent(content: string, options: ChunkOptions = {}): Chunk[] {
+  const maxTokens = options.maxTokens || DEFAULT_MAX_TOKENS;
+  const overlap = options.overlap || DEFAULT_OVERLAP;
+  const splitOn = options.splitOn || 'paragraph';
+
+  const maxChars = maxTokens * CHARS_PER_TOKEN;
+  const overlapChars = overlap * CHARS_PER_TOKEN;
+
+  // If content is small enough, return it as a single chunk
+  if (content.length <= maxChars) {
+    return [{
+      index: 0,
+      content,
+      tokenEstimate: Math.ceil(content.length / CHARS_PER_TOKEN),
+    }];
+  }
+
+  // Split into segments based on strategy
+  const segments = splitContent(content, splitOn);
+
+  // Combine segments into chunks
+  const chunks: Chunk[] = [];
+  let currentChunk = '';
+  let chunkIndex = 0;
+
+  for (const segment of segments) {
+    if (currentChunk.length + segment.length > maxChars && currentChunk.length > 0) {
+      // Save the current chunk
+      chunks.push({
+        index: chunkIndex++,
+        content: currentChunk.trim(),
+        tokenEstimate: Math.ceil(currentChunk.length / CHARS_PER_TOKEN),
+      });
+
+      // Start a new chunk, seeded with overlap from the end of the previous one
+      if (overlapChars > 0 && currentChunk.length > overlapChars) {
+        currentChunk = currentChunk.slice(-overlapChars) + segment;
+      } else {
+        currentChunk = segment;
+      }
+    } else {
+      currentChunk += segment;
+    }
+  }
+
+  // Don't forget the last chunk
+  if (currentChunk.trim()) {
+    chunks.push({
+      index: chunkIndex,
+      content: currentChunk.trim(),
+      tokenEstimate: Math.ceil(currentChunk.length / CHARS_PER_TOKEN),
+    });
+  }
+
+  return chunks;
+}
+
+function splitContent(content: string, strategy: 'paragraph' | 'sentence' | 'heading'): string[] {
+  switch (strategy) {
+    case 'heading':
+      // Split on markdown headings
+      return content.split(/(?=^#{1,6}\s)/m).filter(s => s.trim());
+
+    case 'sentence':
+      // Split on sentence boundaries; re-append a space so sentences don't
+      // run together when segments are concatenated back into chunks
+      return content.split(/(?<=[.!?])\s+/).filter(s => s.trim()).map(s => s + ' ');
+
+    case 'paragraph':
+    default:
+      // Split on double newlines (paragraphs)
+      return content.split(/\n\n+/).filter(s => s.trim()).map(s => s + '\n\n');
+  }
+}
+
+export function estimateTokens(text: string): number {
+  return Math.ceil(text.length / CHARS_PER_TOKEN);
+}
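With the defaults above (1000 tokens × 4 chars/token, 100-token overlap), limits are enforced in characters: chunks top out around 4000 characters, and each chunk after the first is seeded with the trailing ~400 characters of its predecessor. A quick sketch of the expected behavior, using made-up input:

```typescript
import { chunkContent } from './chunker';

// ~12,600 chars of identical paragraphs, so ~3,150 estimated tokens
const doc = 'A short example paragraph.\n\n'.repeat(450);

const chunks = chunkContent(doc); // defaults: 1000 tokens, 100-token overlap
// Expect about four chunks of <= ~4000 chars each; chunks after the first
// begin with the trailing ~400 chars of the previous chunk.
for (const c of chunks) {
  console.log(c.index, c.tokenEstimate, c.content.slice(0, 30));
}
```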
diff --git a/src/core/ingest/fetchers.ts b/src/core/ingest/fetchers.ts
new file mode 100644
index 0000000..4d838ff
--- /dev/null
+++ b/src/core/ingest/fetchers.ts
@@ -0,0 +1,252 @@
+import * as fs from 'fs';
+import * as path from 'path';
+import * as https from 'https';
+import * as http from 'http';
+
+export type SourceType = 'url' | 'pdf' | 'markdown' | 'text' | 'html';
+
+export interface FetchedContent {
+  title: string;
+  content: string;
+  sourceType: SourceType;
+  metadata: Record<string, any>;
+}
+
+export function detectSourceType(source: string): SourceType {
+  if (source.startsWith('http://') || source.startsWith('https://')) {
+    return 'url';
+  }
+  const ext = path.extname(source).toLowerCase();
+  switch (ext) {
+    case '.pdf': return 'pdf';
+    case '.md': return 'markdown';
+    case '.html': case '.htm': return 'html';
+    default: return 'text';
+  }
+}
+
+export async function fetchContent(source: string, sourceType: SourceType): Promise<FetchedContent> {
+  switch (sourceType) {
+    case 'url':
+      return fetchUrl(source);
+    case 'pdf':
+      return fetchPdf(source);
+    case 'markdown':
+    case 'text':
+    case 'html':
+      return fetchFile(source, sourceType);
+    default:
+      throw new Error(`Unsupported source type: ${sourceType}`);
+  }
+}
+
+async function fetchUrl(url: string): Promise<FetchedContent> {
+  const html = await httpGet(url);
+
+  // Extract title from HTML
+  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
+  const title = titleMatch?.[1]?.trim() || new URL(url).hostname;
+
+  // Convert HTML to readable text
+  const content = htmlToText(html);
+
+  return {
+    title,
+    content,
+    sourceType: 'url',
+    metadata: {
+      url,
+      fetchedAt: Date.now(),
+    },
+  };
+}
+
+function httpGet(url: string): Promise<string> {
+  return new Promise((resolve, reject) => {
+    const client = url.startsWith('https') ? https : http;
+    const req = client.get(url, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (compatible; Cortex/1.0)',
+        'Accept': 'text/html,application/xhtml+xml,text/plain,*/*',
+      },
+      timeout: 30000,
+    }, (res) => {
+      // Handle redirects
+      if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
+        const redirectUrl = new URL(res.headers.location, url).toString();
+        resolve(httpGet(redirectUrl));
+        return;
+      }
+
+      if (res.statusCode && res.statusCode >= 400) {
+        reject(new Error(`HTTP ${res.statusCode}`));
+        return;
+      }
+
+      let data = '';
+      res.on('data', chunk => data += chunk);
+      res.on('end', () => resolve(data));
+    });
+    req.on('error', reject);
+    req.on('timeout', () => {
+      req.destroy();
+      reject(new Error('Request timeout'));
+    });
+  });
+}
+
+function htmlToText(html: string): string {
+  // Remove scripts, styles, and other non-content elements
+  let text = html
+    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
+    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
+    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
+    .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
+    .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
+    .replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '');
+
+  // Try to find main content
+  const mainMatch = text.match(/<main[^>]*>([\s\S]*?)<\/main>/i) ||
+    text.match(/<article[^>]*>([\s\S]*?)<\/article>/i) ||
+    text.match(/<div[^>]*class="[^"]*content[^"]*"[^>]*>([\s\S]*?)<\/div>/i);
+
+  if (mainMatch) {
+    text = mainMatch[1];
+  } else {
+    // Fall back to body
+    const bodyMatch = text.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
+    if (bodyMatch) text = bodyMatch[1];
+  }
+
+  // Convert common elements to markdown-ish format
+  text = text
+    .replace(/<h1[^>]*>/gi, '\n# ')
+    .replace(/<\/h1>/gi, '\n')
+    .replace(/<h2[^>]*>/gi, '\n## ')
+    .replace(/<\/h2>/gi, '\n')
+    .replace(/<h3[^>]*>/gi, '\n### ')
+    .replace(/<\/h3>/gi, '\n')
+    .replace(/<h[456][^>]*>/gi, '\n#### ')
+    .replace(/<\/h[456]>/gi, '\n')
+    .replace(/<p[^>]*>/gi, '\n')
+    .replace(/<\/p>/gi, '\n')
+    .replace(/<br\s*\/?>/gi, '\n')
+    .replace(/<li[^>]*>/gi, '\n- ')
+    .replace(/<\/li>/gi, '')
+    .replace(/<[^>]+>/g, ' ') // Remove remaining tags
+    .replace(/&nbsp;/g, ' ') // Decode common HTML entities
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&amp;/g, '&') // Decode &amp; last to avoid double-decoding
+    .replace(/\n{3,}/g, '\n\n') // Collapse multiple newlines
+    .replace(/[ \t]+/g, ' ') // Collapse spaces
+    .trim();
+
+  return text;
+}
+
+async function fetchPdf(filePath: string): Promise<FetchedContent> {
+  // Basic PDF text extraction (very simple - just looks for text streams)
+  // For production, you'd want pdf-parse or pdfjs-dist
+  const buffer = fs.readFileSync(filePath);
+  const content = extractPdfText(buffer);
+
+  return {
+    title: path.basename(filePath, '.pdf'),
+    content: content || '[PDF content could not be extracted. Install pdf-parse for better support.]',
+    sourceType: 'pdf',
+    metadata: {
+      filePath,
+      size: buffer.length,
+      fetchedAt: Date.now(),
+    },
+  };
+}
+
+function extractPdfText(buffer: Buffer): string {
+  // Very basic PDF text extraction
+  // Looks for text between BT and ET markers, handles some encoding
+  const str = buffer.toString('binary');
+  const texts: string[] = [];
+
+  // Find text objects
+  const textRegex = /BT[\s\S]*?ET/g;
+  let match;
+  while ((match = textRegex.exec(str)) !== null) {
+    const block = match[0];
+    // Extract text from Tj and TJ operators
+    const tjMatches = block.match(/\(([^)]*)\)\s*Tj/g) || [];
+    for (const tj of tjMatches) {
+      const textMatch = tj.match(/\(([^)]*)\)/);
+      if (textMatch) {
+        texts.push(textMatch[1]);
+      }
+    }
+  }
+
+  // Also try to find raw text streams
+  const streamRegex = /stream\s*([\s\S]*?)\s*endstream/g;
+  while ((match = streamRegex.exec(str)) !== null) {
+    const decoded = match[1].replace(/[^\x20-\x7E\n\r\t]/g, ' ').trim();
+    if (decoded.length > 50 && /[a-zA-Z]{3,}/.test(decoded)) {
+      texts.push(decoded);
+    }
+  }
+
+  return texts.join('\n').replace(/\s+/g, ' ').trim();
+}
+
+async function fetchFile(filePath: string, sourceType: SourceType): Promise<FetchedContent> {
+  const content = fs.readFileSync(filePath, 'utf-8');
+  const title = path.basename(filePath, path.extname(filePath));
+
+  let processedContent = content;
+  if (sourceType === 'html') {
+    processedContent = htmlToText(content);
+  }
+
+  return {
+    title,
+    content: processedContent,
+    sourceType,
+    metadata: {
+      filePath,
+      fetchedAt: Date.now(),
+    },
+  };
+}
+
+export async function fetchFromClipboard(): Promise<FetchedContent> {
+  // This is platform-specific and won't work in all environments
+  // For standalone builds, we'd need native bindings
+  throw new Error('Clipboard support requires platform-specific implementation');
+}
+
+export async function fetchFromStdin(): Promise<FetchedContent> {
+  return new Promise((resolve, reject) => {
+    let data = '';
+    process.stdin.setEncoding('utf8');
+    process.stdin.on('data', chunk => data += chunk);
+    process.stdin.on('end', () => {
+      resolve({
+        title: 'Stdin Input',
+        content: data.trim(),
+        sourceType: 'text',
+        metadata: {
+          source: 'stdin',
+          fetchedAt: Date.now(),
+        },
+      });
+    });
+    process.stdin.on('error', reject);
+
+    // Timeout if nothing arrives on stdin
+    setTimeout(() => {
+      if (!data) {
+        reject(new Error('No input received from stdin'));
+      }
+    }, 5000);
+  });
+}
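As the comment in `fetchPdf` notes, the regex-based extractor is a stopgap. If the dependency is acceptable, a sturdier `fetchPdf` could delegate to the pdf-parse package instead. A sketch, assuming pdf-parse's documented default export (`npm install pdf-parse`); `fetchPdfWithLibrary` is a hypothetical name, not part of this diff:

```typescript
import * as fs from 'fs';
import * as path from 'path';
import pdf from 'pdf-parse';

// Hypothetical drop-in for fetchPdf using a real PDF parser
async function fetchPdfWithLibrary(filePath: string) {
  const buffer = fs.readFileSync(filePath);
  const parsed = await pdf(buffer); // resolves to { text, numpages, info, ... }
  return {
    title: path.basename(filePath, '.pdf'),
    content: parsed.text.trim(),
    sourceType: 'pdf' as const,
    metadata: { filePath, pages: parsed.numpages, fetchedAt: Date.now() },
  };
}
```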
diff --git a/src/core/ingest/index.ts b/src/core/ingest/index.ts
new file mode 100644
index 0000000..da0c28d
--- /dev/null
+++ b/src/core/ingest/index.ts
@@ -0,0 +1,162 @@
+import * as crypto from 'crypto';
+import { addNode, addEdge, listNodes, query } from '../store';
+import { Node } from '../../types';
+import { detectSourceType, fetchContent, fetchFromStdin, FetchedContent, SourceType } from './fetchers';
+import { chunkContent, ChunkOptions, Chunk } from './chunker';
+
+export interface IngestOptions {
+  title?: string;
+  tags?: string[];
+  chunkStrategy?: ChunkOptions;
+  noLink?: boolean;
+  stdin?: boolean;
+}
+
+export interface IngestResult {
+  success: boolean;
+  sourceType: SourceType;
+  title: string;
+  nodeCount: number;
+  nodes: { id: string; title: string }[];
+  parentId?: string;
+}
+
+export async function ingest(source: string, options: IngestOptions = {}): Promise<IngestResult> {
+  // Fetch content
+  let fetched: FetchedContent;
+
+  if (options.stdin) {
+    fetched = await fetchFromStdin();
+  } else {
+    const sourceType = detectSourceType(source);
+    fetched = await fetchContent(source, sourceType);
+  }
+
+  const title = options.title || fetched.title;
+  const tags = options.tags || [];
+
+  // Compute checksum for deduplication
+  const checksum = crypto.createHash('md5').update(fetched.content).digest('hex');
+
+  // Check for duplicates
+  const existingDupe = listNodes({ kind: 'memory', tags: ['ingested'] })
+    .find(n => (n.metadata as any)?.source?.checksum === checksum);
+
+  if (existingDupe) {
+    return {
+      success: false,
+      sourceType: fetched.sourceType,
+      title,
+      nodeCount: 0,
+      nodes: [],
+    };
+  }
+
+  // Chunk content if needed
+  const chunks = chunkContent(fetched.content, options.chunkStrategy);
+  const nodes: Node[] = [];
+
+  if (chunks.length === 1) {
+    // Single node
+    const node = await addNode({
+      kind: 'memory',
+      title,
+      content: chunks[0].content,
+      tags: ['ingested', fetched.sourceType, ...tags],
+      metadata: {
+        source: {
+          type: fetched.sourceType,
+          ...fetched.metadata,
+          checksum,
+        },
+        tokenEstimate: chunks[0].tokenEstimate,
+      },
+    });
+    nodes.push(node);
+  } else {
+    // Create parent node with summary
+    const summaryContent = `Ingested content from ${fetched.sourceType} source.\n\n` +
+      `**Source:** ${fetched.metadata.url || fetched.metadata.filePath || 'stdin'}\n` +
+      `**Chunks:** ${chunks.length}\n` +
+      `**Total tokens:** ~${chunks.reduce((sum, c) => sum + c.tokenEstimate, 0)}\n\n` +
+      `## Preview\n\n${fetched.content.slice(0, 500)}...`;
+
+    const parentNode = await addNode({
+      kind: 'memory',
+      title,
+      content: summaryContent,
+      tags: ['ingested', fetched.sourceType, 'parent', ...tags],
+      metadata: {
+        source: {
+          type: fetched.sourceType,
+          ...fetched.metadata,
+          checksum,
+        },
+        chunkCount: chunks.length,
+      },
+    });
+    nodes.push(parentNode);
+
+    // Create child nodes for each chunk
+    for (const chunk of chunks) {
+      const chunkNode = await addNode({
+        kind: 'memory',
+        title: `${title} (Part ${chunk.index + 1}/${chunks.length})`,
+        content: chunk.content,
+        tags: ['ingested', fetched.sourceType, 'chunk', ...tags],
+        metadata: {
+          parentId: parentNode.id,
+          chunkIndex: chunk.index,
+          tokenEstimate: chunk.tokenEstimate,
+        },
+      });
+      nodes.push(chunkNode);
+      addEdge(parentNode.id, chunkNode.id, 'contains');
+    }
+  }
+
+  // Auto-link to related nodes (if not disabled)
+  if (!options.noLink) {
+    for (const node of nodes) {
+      await linkRelatedNodes(node);
+    }
+  }
+
+  return {
+    success: true,
+    sourceType: fetched.sourceType,
+    title,
+    nodeCount: nodes.length,
+    nodes: nodes.map(n => ({ id: n.id, title: n.title })),
+    parentId: chunks.length > 1 ? nodes[0].id : undefined,
+  };
+}
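+
+// Note: linking below is deliberately best-effort. A failed search or an
+// already-existing edge is swallowed so that auto-linking can never fail
+// an ingest that otherwise succeeded.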
+
+async function linkRelatedNodes(node: Node): Promise<void> {
+  // Search for related nodes based on content
+  const searchText = node.title + ' ' + node.content.slice(0, 500);
+
+  try {
+    const related = await query(searchText, { limit: 5 });
+
+    for (const result of related) {
+      // Don't link to self or to siblings from the same ingestion: same
+      // source checksum (parent/single nodes) or same parentId (chunk nodes).
+      // Only compare when the value is actually present, so two nodes that
+      // merely both lack a checksum aren't skipped.
+      if (result.node.id === node.id) continue;
+      const own = node.metadata as any;
+      const other = result.node.metadata as any;
+      if (own?.source?.checksum && own.source.checksum === other?.source?.checksum) continue;
+      if (own?.parentId && own.parentId === other?.parentId) continue;
+
+      // Only link if relevance is high enough
+      if (result.score > 0.5) {
+        try {
+          addEdge(node.id, result.node.id, 'relates_to', { reason: 'semantic-similarity', score: result.score });
+        } catch {
+          // Edge might already exist
+        }
+      }
+    }
+  } catch {
+    // Search might fail, that's okay
+  }
+}
+
+export { detectSourceType, fetchContent, fetchFromStdin, SourceType } from './fetchers';
+export { chunkContent, estimateTokens, ChunkOptions, Chunk } from './chunker';
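For reference, this is the shape a multi-chunk ingest produces, per the `IngestResult` interface and the parent/chunk wiring above. All values are illustrative:

```typescript
import { IngestResult } from './index';

// Hypothetical result for a long URL: one parent summary node plus chunks,
// each wired parent -[contains]-> chunk; nodes[0] is always the parent.
const example: IngestResult = {
  success: true,
  sourceType: 'url',
  title: 'Example Article',
  nodeCount: 4, // 1 parent + 3 chunks
  nodes: [
    { id: '3f9a1c02-0000-0000-0000-000000000000', title: 'Example Article' },
    { id: '7b2d9e11-0000-0000-0000-000000000000', title: 'Example Article (Part 1/3)' },
    { id: 'c4e8a003-0000-0000-0000-000000000000', title: 'Example Article (Part 2/3)' },
    { id: '91d0f5a7-0000-0000-0000-000000000000', title: 'Example Article (Part 3/3)' },
  ],
  parentId: '3f9a1c02-0000-0000-0000-000000000000', // === nodes[0].id
};
```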
diff --git a/src/mcp/index.ts b/src/mcp/index.ts
index b1d2600..9e6a844 100644
--- a/src/mcp/index.ts
+++ b/src/mcp/index.ts
@@ -650,6 +650,56 @@ server.tool(
   }
 );
 
+// --- memory_ingest ---
+import { ingest } from '../core/ingest';
+
+server.tool(
+  'memory_ingest',
+  'Ingest content from a URL or text into the knowledge graph',
+  {
+    source: z.string().describe('URL or raw text to ingest'),
+    title: z.string().optional().describe('Override title'),
+    tags: z.array(z.string()).optional().describe('Tags to apply'),
+    isUrl: z.boolean().optional().describe('Treat source as URL (auto-detected if not specified)'),
+  },
+  async ({ source, title, tags, isUrl }) => {
+    // If explicitly not a URL, or it doesn't look like one, treat it as raw text
+    const isSourceUrl = isUrl ?? (source.startsWith('http://') || source.startsWith('https://'));
+
+    if (!isSourceUrl) {
+      // Treat as raw text - create a simple memory node
+      const node = await addNode({
+        kind: 'memory',
+        title: title || 'Ingested Content',
+        content: source,
+        tags: ['ingested', 'text', ...(tags || [])],
+        metadata: { source: { type: 'text', ingestedAt: Date.now() } },
+      });
+      return { content: [{ type: 'text' as const, text: serialize({ success: true, nodeId: node.id, title: node.title }) }] };
+    }
+
+    const result = await ingest(source, { title, tags });
+    return { content: [{ type: 'text' as const, text: serialize(result) }] };
+  }
+);
+
+server.tool(
+  'memory_clip',
+  'Quick clip a URL into memory',
+  {
+    url: z.string().describe('URL to clip'),
+    title: z.string().optional().describe('Override title'),
+    tags: z.array(z.string()).optional().describe('Tags to apply'),
+  },
+  async ({ url, title, tags }) => {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      return { content: [{ type: 'text' as const, text: serialize({ error: 'Invalid URL' }) }], isError: true };
+    }
+    const result = await ingest(url, { title, tags });
+    return { content: [{ type: 'text' as const, text: serialize(result) }] };
+  }
+);
+
 // --- memory_index ---
 import { indexProject } from '../core/indexer';
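The two tools mirror the CLI's ingest/clip split. For callers, the accepted argument shapes — field names taken from the zod schemas above, values purely illustrative:

```typescript
// memory_ingest, URL mode: runs the full fetch/chunk/dedupe/link pipeline
const urlArgs = { source: 'https://example.com/post', tags: ['web'] };

// memory_ingest, raw-text mode: stored directly as a single memory node
const textArgs = { source: 'Meeting notes: ...', title: 'Standup notes', isUrl: false };

// memory_clip: URL-only shortcut; non-URLs come back with isError set
const clipArgs = { url: 'https://example.com/post', tags: ['web'] };
```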