Add URL and content ingestion (Milestone 6)
- Add URL fetching with HTML-to-text extraction
- Add basic PDF text extraction
- Add smart content chunking with overlap
- Add deduplication via content checksums
- Add auto-linking to semantically related nodes
- Add CLI commands: ingest, clip
- Add MCP tools: memory_ingest, memory_clip
src/cli/commands/ingest.ts (new file, 95 lines)
import { Command } from 'commander';
import chalk from 'chalk';
import { ingest } from '../../core/ingest';

export const ingestCommand = new Command('ingest')
  .description('Ingest content from URLs, files, or stdin into the knowledge graph')
  .argument('[source]', 'URL or file path to ingest')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .option('--stdin', 'Read content from stdin')
  .option('--chunk-size <n>', 'Max tokens per chunk (default: 1000)')
  .option('--no-link', 'Skip auto-linking to related nodes')
  .action(async (source: string | undefined, opts) => {
    try {
      if (!source && !opts.stdin) {
        console.error(chalk.red('Error: Provide a source URL/file or use --stdin'));
        process.exit(1);
      }

      if (opts.stdin) {
        console.log(chalk.cyan('Reading from stdin... (Ctrl+D to end)'));
      } else {
        console.log(chalk.cyan(`Ingesting: ${source}`));
      }

      const result = await ingest(source || '', {
        title: opts.title,
        tags: opts.tags?.split(',').map((t: string) => t.trim()),
        stdin: opts.stdin,
        noLink: !opts.link,
        chunkStrategy: opts.chunkSize ? {
          maxTokens: parseInt(opts.chunkSize),
        } : undefined,
      });

      if (!result.success) {
        console.log(chalk.yellow('Content already exists (duplicate checksum)'));
        return;
      }

      console.log();
      console.log(chalk.green(`✓ Ingested: ${result.title}`));
      console.log();
      console.log(`  Type: ${result.sourceType}`);
      console.log(`  Nodes: ${result.nodeCount}`);

      if (result.parentId) {
        console.log(`  Parent: ${result.parentId.slice(0, 8)}`);
      }

      for (const node of result.nodes.slice(0, 5)) {
        console.log(chalk.dim(`  - ${node.id.slice(0, 8)} ${node.title}`));
      }

      if (result.nodes.length > 5) {
        console.log(chalk.dim(`  ... and ${result.nodes.length - 5} more`));
      }
    } catch (err: any) {
      console.error(chalk.red(`Error: ${err.message}`));
      process.exit(1);
    }
  });

// Alias for quick URL clipping
export const clipCommand = new Command('clip')
  .description('Quick clip a URL (alias for ingest)')
  .argument('<url>', 'URL to clip')
  .option('-t, --title <title>', 'Override title')
  .option('--tags <tags>', 'Tags to apply (comma-separated)')
  .action(async (url: string, opts) => {
    try {
      if (!url.startsWith('http://') && !url.startsWith('https://')) {
        console.error(chalk.red('Error: clip expects a URL'));
        process.exit(1);
      }

      console.log(chalk.cyan(`Clipping: ${url}`));

      const result = await ingest(url, {
        title: opts.title,
        tags: opts.tags?.split(',').map((t: string) => t.trim()),
      });

      if (!result.success) {
        console.log(chalk.yellow('Already clipped (duplicate)'));
        return;
      }

      console.log(chalk.green(`✓ ${result.title}`));
      console.log(chalk.dim(`  ${result.nodes[0].id.slice(0, 8)}`));
    } catch (err: any) {
      console.error(chalk.red(`Error: ${err.message}`));
      process.exit(1);
    }
  });
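One subtlety in the action handler above: Commander treats an option declared as `--no-link` as a negatable boolean, exposed as `opts.link` (defaulting to `true`, set to `false` when the flag is passed) rather than `opts.noLink` — which is why the handler reads `noLink: !opts.link`. A minimal sketch of that behavior (the `demo` command is hypothetical, not part of this commit):

import { Command } from 'commander';

const demo = new Command('demo')
  .option('--no-link', 'Skip auto-linking')
  .action((opts) => {
    // Without the flag: opts.link === true; with --no-link: opts.link === false
    console.log('link enabled:', opts.link);
  });

demo.parse(['--no-link'], { from: 'user' });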
CLI entry module (modified):

@@ -18,6 +18,7 @@ import { captureCommand, captureHookCommand, configCommand } from './commands/ca
 import { contextCommand, contextHookCommand } from './commands/context';
 import { indexCommand } from './commands/index-cmd';
 import { journalCommand, journalAliasCommand, quickCaptureCommand } from './commands/journal';
+import { ingestCommand, clipCommand } from './commands/ingest';
 import { closeDb } from '../core/db';

 const program = new Command();
@@ -50,6 +51,8 @@ program.addCommand(indexCommand);
 program.addCommand(journalCommand);
 program.addCommand(journalAliasCommand);
 program.addCommand(quickCaptureCommand);
+program.addCommand(ingestCommand);
+program.addCommand(clipCommand);

 program.hook('postAction', () => {
   closeDb();
src/core/ingest/chunker.ts (new file, 93 lines)
export interface ChunkOptions {
  maxTokens?: number;
  overlap?: number;
  splitOn?: 'paragraph' | 'sentence' | 'heading';
}

export interface Chunk {
  index: number;
  content: string;
  tokenEstimate: number;
}

const DEFAULT_MAX_TOKENS = 1000;
const DEFAULT_OVERLAP = 100;
const CHARS_PER_TOKEN = 4; // Rough estimate

export function chunkContent(content: string, options: ChunkOptions = {}): Chunk[] {
  const maxTokens = options.maxTokens || DEFAULT_MAX_TOKENS;
  const overlap = options.overlap || DEFAULT_OVERLAP;
  const splitOn = options.splitOn || 'paragraph';

  const maxChars = maxTokens * CHARS_PER_TOKEN;
  const overlapChars = overlap * CHARS_PER_TOKEN;

  // If content is small enough, return as a single chunk
  if (content.length <= maxChars) {
    return [{
      index: 0,
      content,
      tokenEstimate: Math.ceil(content.length / CHARS_PER_TOKEN),
    }];
  }

  // Split into segments based on strategy
  const segments = splitContent(content, splitOn);

  // Combine segments into chunks
  const chunks: Chunk[] = [];
  let currentChunk = '';
  let chunkIndex = 0;

  for (const segment of segments) {
    if (currentChunk.length + segment.length > maxChars && currentChunk.length > 0) {
      // Save the current chunk
      chunks.push({
        index: chunkIndex++,
        content: currentChunk.trim(),
        tokenEstimate: Math.ceil(currentChunk.length / CHARS_PER_TOKEN),
      });

      // Start the new chunk with overlap from the end of the previous one
      if (overlapChars > 0 && currentChunk.length > overlapChars) {
        currentChunk = currentChunk.slice(-overlapChars) + segment;
      } else {
        currentChunk = segment;
      }
    } else {
      currentChunk += segment;
    }
  }

  // Don't forget the last chunk
  if (currentChunk.trim()) {
    chunks.push({
      index: chunkIndex,
      content: currentChunk.trim(),
      tokenEstimate: Math.ceil(currentChunk.length / CHARS_PER_TOKEN),
    });
  }

  return chunks;
}

function splitContent(content: string, strategy: 'paragraph' | 'sentence' | 'heading'): string[] {
  switch (strategy) {
    case 'heading':
      // Split on markdown headings
      return content.split(/(?=^#{1,6}\s)/m).filter(s => s.trim());

    case 'sentence':
      // Split on sentence boundaries
      return content.split(/(?<=[.!?])\s+/).filter(s => s.trim());

    case 'paragraph':
    default:
      // Split on double newlines (paragraphs)
      return content.split(/\n\n+/).filter(s => s.trim()).map(s => s + '\n\n');
  }
}

export function estimateTokens(text: string): number {
  return Math.ceil(text.length / CHARS_PER_TOKEN);
}
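To see the output shape, here is a small hedged usage sketch; the sample text and tiny limits are invented for illustration, and the token figures are only the 4-chars-per-token estimate used above:

import { chunkContent, estimateTokens } from './chunker';

// Illustrative only: two short paragraphs forced into tiny chunks.
const sample = 'First paragraph about graphs.\n\nSecond paragraph about chunking.';

// maxTokens: 10 -> maxChars: 40; overlap: 2 -> overlapChars: 8
const chunks = chunkContent(sample, { maxTokens: 10, overlap: 2 });

for (const c of chunks) {
  console.log(c.index, c.tokenEstimate, JSON.stringify(c.content));
}
// Each chunk after the first begins with up to 8 trailing characters
// of its predecessor, giving retrieval a little shared context.

console.log(estimateTokens(sample)); // ceil(length / 4)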
src/core/ingest/fetchers.ts (new file, 252 lines)
import * as fs from 'fs';
import * as path from 'path';
import * as https from 'https';
import * as http from 'http';

export type SourceType = 'url' | 'pdf' | 'markdown' | 'text' | 'html';

export interface FetchedContent {
  title: string;
  content: string;
  sourceType: SourceType;
  metadata: Record<string, any>;
}

export function detectSourceType(source: string): SourceType {
  if (source.startsWith('http://') || source.startsWith('https://')) {
    return 'url';
  }
  const ext = path.extname(source).toLowerCase();
  switch (ext) {
    case '.pdf': return 'pdf';
    case '.md': return 'markdown';
    case '.html': case '.htm': return 'html';
    default: return 'text';
  }
}

export async function fetchContent(source: string, sourceType: SourceType): Promise<FetchedContent> {
  switch (sourceType) {
    case 'url':
      return fetchUrl(source);
    case 'pdf':
      return fetchPdf(source);
    case 'markdown':
    case 'text':
    case 'html':
      return fetchFile(source, sourceType);
    default:
      throw new Error(`Unsupported source type: ${sourceType}`);
  }
}

async function fetchUrl(url: string): Promise<FetchedContent> {
  const html = await httpGet(url);

  // Extract title from HTML
  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  const title = titleMatch?.[1]?.trim() || new URL(url).hostname;

  // Convert HTML to readable text
  const content = htmlToText(html);

  return {
    title,
    content,
    sourceType: 'url',
    metadata: {
      url,
      fetchedAt: Date.now(),
    },
  };
}

function httpGet(url: string): Promise<string> {
  return new Promise((resolve, reject) => {
    const client = url.startsWith('https') ? https : http;
    const req = client.get(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; Cortex/1.0)',
        'Accept': 'text/html,application/xhtml+xml,text/plain,*/*',
      },
      timeout: 30000,
    }, (res) => {
      // Handle redirects
      if (res.statusCode && res.statusCode >= 300 && res.statusCode < 400 && res.headers.location) {
        const redirectUrl = new URL(res.headers.location, url).toString();
        resolve(httpGet(redirectUrl));
        return;
      }

      if (res.statusCode && res.statusCode >= 400) {
        reject(new Error(`HTTP ${res.statusCode}`));
        return;
      }

      let data = '';
      res.on('data', chunk => data += chunk);
      res.on('end', () => resolve(data));
    });
    req.on('error', reject);
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
  });
}

function htmlToText(html: string): string {
  // Remove scripts, styles, and other non-content elements
  let text = html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
    .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
    .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
    .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
    .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
    .replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '');

  // Try to find the main content
  const mainMatch = text.match(/<main[^>]*>([\s\S]*?)<\/main>/i) ||
    text.match(/<article[^>]*>([\s\S]*?)<\/article>/i) ||
    text.match(/<div[^>]*class="[^"]*content[^"]*"[^>]*>([\s\S]*?)<\/div>/i);

  if (mainMatch) {
    text = mainMatch[1];
  } else {
    // Fall back to the body
    const bodyMatch = text.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    if (bodyMatch) text = bodyMatch[1];
  }

  // Convert common elements to a markdown-ish format
  text = text
    .replace(/<h1[^>]*>/gi, '\n# ')
    .replace(/<\/h1>/gi, '\n')
    .replace(/<h2[^>]*>/gi, '\n## ')
    .replace(/<\/h2>/gi, '\n')
    .replace(/<h3[^>]*>/gi, '\n### ')
    .replace(/<\/h3>/gi, '\n')
    .replace(/<h[456][^>]*>/gi, '\n#### ')
    .replace(/<\/h[456]>/gi, '\n')
    .replace(/<p[^>]*>/gi, '\n')
    .replace(/<\/p>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n')
    .replace(/<li[^>]*>/gi, '\n- ')
    .replace(/<\/li>/gi, '')
    .replace(/<[^>]+>/g, ' ') // Remove remaining tags
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/\n{3,}/g, '\n\n') // Collapse multiple newlines
    .replace(/[ \t]+/g, ' ') // Collapse spaces
    .trim();

  return text;
}

async function fetchPdf(filePath: string): Promise<FetchedContent> {
  // Basic PDF text extraction (very simple - just looks for text streams)
  // For production, you'd want pdf-parse or pdfjs-dist
  const buffer = fs.readFileSync(filePath);
  const content = extractPdfText(buffer);

  return {
    title: path.basename(filePath, '.pdf'),
    content: content || '[PDF content could not be extracted. Install pdf-parse for better support.]',
    sourceType: 'pdf',
    metadata: {
      filePath,
      size: buffer.length,
      fetchedAt: Date.now(),
    },
  };
}

function extractPdfText(buffer: Buffer): string {
  // Very basic PDF text extraction
  // Looks for text between BT and ET markers, handles some encoding
  const str = buffer.toString('binary');
  const texts: string[] = [];

  // Find text objects
  const textRegex = /BT[\s\S]*?ET/g;
  let match;
  while ((match = textRegex.exec(str)) !== null) {
    const block = match[0];
    // Extract text from Tj and TJ operators
    const tjMatches = block.match(/\(([^)]*)\)\s*Tj/g) || [];
    for (const tj of tjMatches) {
      const textMatch = tj.match(/\(([^)]*)\)/);
      if (textMatch) {
        texts.push(textMatch[1]);
      }
    }
  }

  // Also try to find raw text streams
  const streamRegex = /stream\s*([\s\S]*?)\s*endstream/g;
  while ((match = streamRegex.exec(str)) !== null) {
    const decoded = match[1].replace(/[^\x20-\x7E\n\r\t]/g, ' ').trim();
    if (decoded.length > 50 && /[a-zA-Z]{3,}/.test(decoded)) {
      texts.push(decoded);
    }
  }

  return texts.join('\n').replace(/\s+/g, ' ').trim();
}

async function fetchFile(filePath: string, sourceType: SourceType): Promise<FetchedContent> {
  const content = fs.readFileSync(filePath, 'utf-8');
  const title = path.basename(filePath, path.extname(filePath));

  let processedContent = content;
  if (sourceType === 'html') {
    processedContent = htmlToText(content);
  }

  return {
    title,
    content: processedContent,
    sourceType,
    metadata: {
      filePath,
      fetchedAt: Date.now(),
    },
  };
}

export async function fetchFromClipboard(): Promise<FetchedContent> {
  // This is platform-specific and won't work in all environments
  // For standalone builds, we'd need native bindings
  throw new Error('Clipboard support requires platform-specific implementation');
}

export async function fetchFromStdin(): Promise<FetchedContent> {
  return new Promise((resolve, reject) => {
    let data = '';
    process.stdin.setEncoding('utf8');
    process.stdin.on('data', chunk => data += chunk);
    process.stdin.on('end', () => {
      resolve({
        title: 'Stdin Input',
        content: data.trim(),
        sourceType: 'text',
        metadata: {
          source: 'stdin',
          fetchedAt: Date.now(),
        },
      });
    });
    process.stdin.on('error', reject);

    // Timeout for stdin
    setTimeout(() => {
      if (!data) {
        reject(new Error('No input received from stdin'));
      }
    }, 5000);
  });
}
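The extractor above is deliberately crude, and its own comment names pdf-parse as the production path. A hedged sketch of what that swap might look like — assuming pdf-parse is installed as a dependency (its default export takes a Buffer and resolves to an object with a text field); this helper is hypothetical and not part of the commit:

// Hypothetical replacement for extractPdfText using pdf-parse.
// Assumes `npm install pdf-parse`; not part of this commit.
import * as fs from 'fs';
import pdfParse from 'pdf-parse';

async function extractPdfTextWithLibrary(filePath: string): Promise<string> {
  const buffer = fs.readFileSync(filePath);
  const data = await pdfParse(buffer);
  // data.text holds the extracted text; data.numpages and data.info are also available
  return data.text.trim();
}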
src/core/ingest/index.ts (new file, 162 lines)
import * as crypto from 'crypto';
import { addNode, addEdge, listNodes, query } from '../store';
import { Node } from '../../types';
import { detectSourceType, fetchContent, fetchFromStdin, FetchedContent, SourceType } from './fetchers';
import { chunkContent, ChunkOptions, Chunk } from './chunker';

export interface IngestOptions {
  title?: string;
  tags?: string[];
  chunkStrategy?: ChunkOptions;
  noLink?: boolean;
  stdin?: boolean;
}

export interface IngestResult {
  success: boolean;
  sourceType: SourceType;
  title: string;
  nodeCount: number;
  nodes: { id: string; title: string }[];
  parentId?: string;
}

export async function ingest(source: string, options: IngestOptions = {}): Promise<IngestResult> {
  // Fetch content
  let fetched: FetchedContent;

  if (options.stdin) {
    fetched = await fetchFromStdin();
  } else {
    const sourceType = detectSourceType(source);
    fetched = await fetchContent(source, sourceType);
  }

  const title = options.title || fetched.title;
  const tags = options.tags || [];

  // Compute checksum for deduplication
  const checksum = crypto.createHash('md5').update(fetched.content).digest('hex');

  // Check for duplicates
  const existingDupe = listNodes({ kind: 'memory', tags: ['ingested'] })
    .find(n => (n.metadata as any)?.source?.checksum === checksum);

  if (existingDupe) {
    return {
      success: false,
      sourceType: fetched.sourceType,
      title,
      nodeCount: 0,
      nodes: [],
    };
  }

  // Chunk content if needed
  const chunks = chunkContent(fetched.content, options.chunkStrategy);
  const nodes: Node[] = [];

  if (chunks.length === 1) {
    // Single node
    const node = await addNode({
      kind: 'memory',
      title,
      content: chunks[0].content,
      tags: ['ingested', fetched.sourceType, ...tags],
      metadata: {
        source: {
          type: fetched.sourceType,
          ...fetched.metadata,
          checksum,
        },
        tokenEstimate: chunks[0].tokenEstimate,
      },
    });
    nodes.push(node);
  } else {
    // Create a parent node with a summary
    const summaryContent = `Ingested content from ${fetched.sourceType} source.\n\n` +
      `**Source:** ${fetched.metadata.url || fetched.metadata.filePath || 'stdin'}\n` +
      `**Chunks:** ${chunks.length}\n` +
      `**Total tokens:** ~${chunks.reduce((sum, c) => sum + c.tokenEstimate, 0)}\n\n` +
      `## Preview\n\n${fetched.content.slice(0, 500)}...`;

    const parentNode = await addNode({
      kind: 'memory',
      title,
      content: summaryContent,
      tags: ['ingested', fetched.sourceType, 'parent', ...tags],
      metadata: {
        source: {
          type: fetched.sourceType,
          ...fetched.metadata,
          checksum,
        },
        chunkCount: chunks.length,
      },
    });
    nodes.push(parentNode);

    // Create child nodes for each chunk
    for (const chunk of chunks) {
      const chunkNode = await addNode({
        kind: 'memory',
        title: `${title} (Part ${chunk.index + 1}/${chunks.length})`,
        content: chunk.content,
        tags: ['ingested', fetched.sourceType, 'chunk', ...tags],
        metadata: {
          parentId: parentNode.id,
          chunkIndex: chunk.index,
          tokenEstimate: chunk.tokenEstimate,
        },
      });
      nodes.push(chunkNode);
      addEdge(parentNode.id, chunkNode.id, 'contains');
    }
  }

  // Auto-link to related nodes (if not disabled)
  if (!options.noLink) {
    for (const node of nodes) {
      await linkRelatedNodes(node);
    }
  }

  return {
    success: true,
    sourceType: fetched.sourceType,
    title,
    nodeCount: nodes.length,
    nodes: nodes.map(n => ({ id: n.id, title: n.title })),
    parentId: chunks.length > 1 ? nodes[0].id : undefined,
  };
}

async function linkRelatedNodes(node: Node): Promise<void> {
  // Search for related nodes based on content
  const searchText = node.title + ' ' + node.content.slice(0, 500);

  try {
    const related = await query(searchText, { limit: 5 });

    for (const result of related) {
      // Don't link to self or siblings from the same ingestion
      if (result.node.id === node.id) continue;
      if ((result.node.metadata as any)?.source?.checksum === (node.metadata as any)?.source?.checksum) continue;

      // Only link if relevance is high enough
      if (result.score > 0.5) {
        try {
          addEdge(node.id, result.node.id, 'relates_to', { reason: 'semantic-similarity', score: result.score });
        } catch {
          // Edge might already exist
        }
      }
    }
  } catch {
    // Search might fail; that's okay
  }
}

export { detectSourceType, fetchContent, fetchFromStdin, SourceType } from './fetchers';
export { chunkContent, estimateTokens, ChunkOptions, Chunk } from './chunker';
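Putting the module together, a hedged usage sketch of the exported API; the URL, tags, and chunk limits are illustrative, and the result fields mirror IngestResult above:

import { ingest } from '../core/ingest'; // same import path the MCP server uses

async function demo(): Promise<void> {
  // Illustrative URL and options; a long page yields a parent node plus chunk nodes.
  const result = await ingest('https://example.com/article', {
    tags: ['reading'],
    chunkStrategy: { maxTokens: 800, overlap: 80 },
  });

  if (result.success) {
    console.log(result.nodeCount, 'node(s) created');
    if (result.parentId) console.log('parent:', result.parentId);
  }

  // Re-ingesting identical content trips the MD5 checksum check:
  const again = await ingest('https://example.com/article', { tags: ['reading'] });
  console.log(again.success); // false (duplicate)
}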
MCP server module (modified):

@@ -650,6 +650,56 @@ server.tool(
   }
 );

+// --- memory_ingest ---
+import { ingest } from '../core/ingest';
+
+server.tool(
+  'memory_ingest',
+  'Ingest content from a URL or text into the knowledge graph',
+  {
+    source: z.string().describe('URL or raw text to ingest'),
+    title: z.string().optional().describe('Override title'),
+    tags: z.array(z.string()).optional().describe('Tags to apply'),
+    isUrl: z.boolean().optional().describe('Treat source as URL (auto-detected if not specified)'),
+  },
+  async ({ source, title, tags, isUrl }) => {
+    // If explicitly not a URL, or it doesn't look like one, treat the source as raw text
+    const isSourceUrl = isUrl ?? (source.startsWith('http://') || source.startsWith('https://'));
+
+    if (!isSourceUrl) {
+      // Treat as raw text - create a simple memory node
+      const node = await addNode({
+        kind: 'memory',
+        title: title || 'Ingested Content',
+        content: source,
+        tags: ['ingested', 'text', ...(tags || [])],
+        metadata: { source: { type: 'text', ingestedAt: Date.now() } },
+      });
+      return { content: [{ type: 'text' as const, text: serialize({ success: true, nodeId: node.id, title: node.title }) }] };
+    }
+
+    const result = await ingest(source, { title, tags });
+    return { content: [{ type: 'text' as const, text: serialize(result) }] };
+  }
+);
+
+server.tool(
+  'memory_clip',
+  'Quick clip a URL into memory',
+  {
+    url: z.string().describe('URL to clip'),
+    title: z.string().optional().describe('Override title'),
+    tags: z.array(z.string()).optional().describe('Tags to apply'),
+  },
+  async ({ url, title, tags }) => {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      return { content: [{ type: 'text' as const, text: serialize({ error: 'Invalid URL' }) }], isError: true };
+    }
+    const result = await ingest(url, { title, tags });
+    return { content: [{ type: 'text' as const, text: serialize(result) }] };
+  }
+);
+
 // --- memory_index ---
 import { indexProject } from '../core/indexer';
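For clients, both tools are reached through the standard MCP tools/call request. A hedged sketch of the wire-level payload a client might send for memory_clip — the field names follow the MCP JSON-RPC protocol, and the URL and tags are illustrative:

// Shape of an MCP tools/call request invoking the new memory_clip tool.
const request = {
  jsonrpc: '2.0' as const,
  id: 1,
  method: 'tools/call',
  params: {
    name: 'memory_clip',
    arguments: {
      url: 'https://example.com/article',
      tags: ['clipped'],
    },
  },
};
// Per the handler above, the server replies with
// { content: [{ type: 'text', text: <serialized IngestResult> }] }.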