diff --git a/src/cli/commands/ingest.ts b/src/cli/commands/ingest.ts
new file mode 100644
index 0000000..3a7b8b5
--- /dev/null
+++ b/src/cli/commands/ingest.ts
@@ -0,0 +1,95 @@
+import { Command } from 'commander';
+import chalk from 'chalk';
+import { ingest } from '../../core/ingest';
+
+// `ingest` command: validates the CLI options, runs the core ingest() and prints a summary.
+export const ingestCommand = new Command('ingest')
+  .description('Ingest content from URLs, files, or stdin into the knowledge graph')
+  .argument('[source]', 'URL or file path to ingest')
+  // Value options need a <placeholder>; without one commander treats the flag as a boolean.
+  .option('-t, --title <title>', 'Override title')
+  .option('--tags <tags>', 'Tags to apply (comma-separated)')
+  .option('--stdin', 'Read content from stdin')
+  .option('--chunk-size <tokens>', 'Max tokens per chunk (default: 1000)')
+  .option('--no-link', 'Skip auto-linking to related nodes')
+  .action(async (source: string | undefined, opts) => {
+    try {
+      if (!source && !opts.stdin) {
+        console.error(chalk.red('Error: Provide a source URL/file or use --stdin'));
+        process.exit(1);
+      }
+      // Reject a non-numeric or non-positive size instead of handing NaN to the chunker.
+      const maxTokens = opts.chunkSize === undefined ? undefined : Number.parseInt(opts.chunkSize, 10);
+      if (maxTokens !== undefined && !(maxTokens > 0)) {
+        console.error(chalk.red('Error: --chunk-size must be a positive integer'));
+        process.exit(1);
+      }
+      console.log(chalk.cyan(opts.stdin ? 'Reading from stdin... (Ctrl+D to end)' : `Ingesting: ${source}`));
+
+      const result = await ingest(source || '', {
+        title: opts.title,
+        // filter(Boolean) drops the empty tag left by a trailing or doubled comma.
+        tags: opts.tags?.split(',').map((t: string) => t.trim()).filter(Boolean),
+        stdin: opts.stdin,
+        noLink: !opts.link, // --no-link makes commander set opts.link to false
+        chunkStrategy: maxTokens === undefined ? undefined : { maxTokens },
+      });
+
+      if (!result.success) {
+        console.log(chalk.yellow('Content already exists (duplicate checksum)'));
+        return;
+      }
+
+      console.log();
+      console.log(chalk.green(`✓ Ingested: ${result.title}`));
+      console.log();
+      console.log(` Type: ${result.sourceType}`);
+      console.log(` Nodes: ${result.nodeCount}`);
+      if (result.parentId) {
+        console.log(` Parent: ${result.parentId.slice(0, 8)}`);
+      }
+      // Show at most five nodes, then summarise the remainder.
+      for (const node of result.nodes.slice(0, 5)) {
+        console.log(chalk.dim(` - ${node.id.slice(0, 8)} ${node.title}`));
+      }
+      if (result.nodes.length > 5) {
+        console.log(chalk.dim(` ... and ${result.nodes.length - 5} more`));
+      }
+    } catch (err: unknown) {
+      console.error(chalk.red(`Error: ${err instanceof Error ? err.message : String(err)}`));
+      process.exit(1);
+    }
+  });
+
+// Shortcut for clipping one web page: passes the URL to ingest() with only title and tags.
+export const clipCommand = new Command('clip')
+  .description('Quick clip a URL (alias for ingest)')
+  .argument('<url>', 'URL to clip')
+  .option('-t, --title <title>', 'Override title')
+  .option('--tags <tags>', 'Tags to apply (comma-separated)')
+  .action(async (url: string, opts) => {
+    try {
+      if (!url.startsWith('http://') && !url.startsWith('https://')) {
+        console.error(chalk.red('Error: clip expects a URL'));
+        process.exit(1);
+      }
+
+      console.log(chalk.cyan(`Clipping: ${url}`));
+      const result = await ingest(url, {
+        title: opts.title,
+        tags: opts.tags?.split(',').map((t: string) => t.trim()).filter(Boolean),
+      });
+
+      if (!result.success) {
+        console.log(chalk.yellow('Already clipped (duplicate)'));
+        return;
+      }
+
+      console.log(chalk.green(`✓ ${result.title}`));
+      // A successful result with no nodes must not crash on nodes[0].
+      if (result.nodes.length > 0) console.log(chalk.dim(` ${result.nodes[0].id.slice(0, 8)}`));
+    } catch (err: unknown) {
+      console.error(chalk.red(`Error: ${err instanceof Error ? err.message : String(err)}`));
+      process.exit(1);
+    }
+  });
diff --git a/src/cli/index.ts b/src/cli/index.ts
index a802482..b1c46dd 100644
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@@ -18,6 +18,7 @@ import { captureCommand, captureHookCommand, configCommand } from './commands/ca
import { contextCommand, contextHookCommand } from './commands/context';
import { indexCommand } from './commands/index-cmd';
import { journalCommand, journalAliasCommand, quickCaptureCommand } from './commands/journal';
+import { ingestCommand, clipCommand } from './commands/ingest';
import { closeDb } from '../core/db';
const program = new Command();
@@ -50,6 +51,8 @@ program.addCommand(indexCommand);
program.addCommand(journalCommand);
program.addCommand(journalAliasCommand);
program.addCommand(quickCaptureCommand);
+program.addCommand(ingestCommand);
+program.addCommand(clipCommand);
program.hook('postAction', () => {
closeDb();
diff --git a/src/core/ingest/chunker.ts b/src/core/ingest/chunker.ts
new file mode 100644
index 0000000..a211753
--- /dev/null
+++ b/src/core/ingest/chunker.ts
@@ -0,0 +1,93 @@
+export interface ChunkOptions { // options read by chunkContent(); every field is optional
+ maxTokens?: number; // chunk size budget in estimated tokens; DEFAULT_MAX_TOKENS when omitted
+ overlap?: number; // estimated tokens of the previous chunk repeated at the start of the next
+ splitOn?: 'paragraph' | 'sentence' | 'heading'; // segment boundary; 'paragraph' when omitted
+}
+
+export interface Chunk { // one piece of content produced by chunkContent()
+ index: number; // position of the chunk in the output, starting at 0
+ content: string; // chunk text
+ tokenEstimate: number; // estimated token count (characters / CHARS_PER_TOKEN, rounded up)
+}
+
+const DEFAULT_MAX_TOKENS = 1000; // chunk budget used when ChunkOptions.maxTokens is not set
+const DEFAULT_OVERLAP = 100; // overlap, in tokens, used when ChunkOptions.overlap is not set
+const CHARS_PER_TOKEN = 4; // Rough estimate: one token is treated as four characters
+
+/**
+ * Splits `content` into chunks of about `maxTokens` tokens, estimating one token
+ * per CHARS_PER_TOKEN characters. After the first chunk, each chunk starts with
+ * the last `overlap` tokens of the text before it, so a chunk can exceed the
+ * budget by up to the overlap. Content that already fits is returned unchanged
+ * as a single chunk; otherwise chunk text is trimmed.
+ * Cuts are made by UTF-16 code unit, so one can fall in the middle of a word.
+ *
+ * @param content Text to split.
+ * @param options Size, overlap and split strategy; see ChunkOptions.
+ * @returns Chunks in document order, with `index` counting up from 0.
+ */
+export function chunkContent(content: string, options: ChunkOptions = {}): Chunk[] {
+  // A missing, zero, negative or NaN size falls back to the default.
+  const maxTokens = options.maxTokens && options.maxTokens > 0 ? options.maxTokens : DEFAULT_MAX_TOKENS;
+  // `??` rather than `||` so that `overlap: 0` really disables overlap.
+  const overlap = options.overlap ?? DEFAULT_OVERLAP;
+  const splitOn = options.splitOn || 'paragraph';
+
+  const maxChars = Math.max(1, Math.floor(maxTokens * CHARS_PER_TOKEN));
+  const overlapChars = Math.floor(overlap * CHARS_PER_TOKEN);
+
+  // If content is small enough, return as single chunk
+  if (content.length <= maxChars) {
+    return [{ index: 0, content, tokenEstimate: estimateTokens(content) }];
+  }
+
+  const chunks: Chunk[] = [];
+  // Trims before estimating so tokenEstimate describes the text actually stored.
+  const pushChunk = (raw: string): void => {
+    const text = raw.trim();
+    if (text) {
+      chunks.push({ index: chunks.length, content: text, tokenEstimate: estimateTokens(text) });
+    }
+  };
+
+  let current = '';
+  for (const segment of splitContent(content, splitOn)) {
+    // A segment longer than the budget (e.g. one huge paragraph) is cut into
+    // budget-sized pieces; otherwise it would become a single oversized chunk.
+    for (let start = 0; start < segment.length; start += maxChars) {
+      const piece = segment.slice(start, start + maxChars);
+      if (current.length > 0 && current.length + piece.length > maxChars) {
+        pushChunk(current);
+        // Seed the next chunk with the tail of the one just emitted.
+        const tail = overlapChars > 0 && current.length > overlapChars ? current.slice(-overlapChars) : '';
+        current = tail + piece;
+      } else {
+        current += piece;
+      }
+    }
+  }
+
+  pushChunk(current);
+  return chunks;
+}
+
+// Cuts content into segments for chunkContent(), which concatenates them verbatim,
+// so each strategy keeps (or re-adds) the separator that follows a segment.
+function splitContent(content: string, strategy: 'paragraph' | 'sentence' | 'heading'): string[] {
+  switch (strategy) {
+    case 'heading':
+      // Split on markdown headings
+      return content.split(/(?=^#{1,6}\s)/m).filter(s => s.trim());
+    case 'sentence':
+      // Break after the whitespace that follows . ! or ? so the gap stays with its sentence.
+      return content.split(/(?<=[.!?]\s+)(?=\S)/).filter(s => s.trim());
+    case 'paragraph':
+    default:
+      // Split on double newlines (paragraphs)
+      return content.split(/\n\n+/).filter(s => s.trim()).map(s => s + '\n\n');
+  }
+}
+
+export function estimateTokens(text: string): number { // rough count: CHARS_PER_TOKEN characters per token
+ return Math.ceil(text.length / CHARS_PER_TOKEN); // rounded up, so any non-empty text counts as at least 1
+}
diff --git a/src/core/ingest/fetchers.ts b/src/core/ingest/fetchers.ts
new file mode 100644
index 0000000..4d838ff
--- /dev/null
+++ b/src/core/ingest/fetchers.ts
@@ -0,0 +1,252 @@
+import * as fs from 'fs';
+import * as path from 'path';
+import * as https from 'https';
+import * as http from 'http';
+
+export type SourceType = 'url' | 'pdf' | 'markdown' | 'text' | 'html'; // values returned by detectSourceType()
+
+export interface FetchedContent { // what a fetcher returns for one source
+  title: string;
+  content: string;
+  sourceType: SourceType;
+  metadata: Record<string, any>; // Record needs both type arguments; kept loose for readers outside this file — TODO tighten
+}
+
+/** Classifies `source`: http(s) URLs (scheme matched case-insensitively) are 'url'; files go by extension. */
+export function detectSourceType(source: string): SourceType {
+  if (/^https?:\/\//i.test(source)) {
+    return 'url';
+  }
+  switch (path.extname(source).toLowerCase()) {
+    case '.pdf': return 'pdf';
+    case '.md': return 'markdown';
+    case '.html': case '.htm': return 'html';
+    default: return 'text'; // unknown or missing extension
+  }
+}
+
+/**
+ * Routes `source` to the fetcher for its `sourceType`; markdown, text and html
+ * files share fetchFile. Throws if `sourceType` is not one of the SourceType values.
+ */
+export async function fetchContent(source: string, sourceType: SourceType): Promise<FetchedContent> {
+  switch (sourceType) {
+    case 'url': return fetchUrl(source);
+    case 'pdf': return fetchPdf(source);
+    case 'markdown':
+    case 'text':
+    case 'html': return fetchFile(source, sourceType);
+    default: throw new Error(`Unsupported source type: ${sourceType}`);
+  }
+}
+
+/**
+ * Downloads `url` and converts the HTML to readable text. The title comes from
+ * the first <title> element, or the URL's hostname when there is none.
+ */
+async function fetchUrl(url: string): Promise<FetchedContent> {
+  const html = await httpGet(url);
+  // Collapse whitespace so a <title> wrapped over several lines gives a one-line title.
+  const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
+  const title = titleMatch?.[1]?.replace(/\s+/g, ' ').trim() || new URL(url).hostname;
+
+  return {
+    title,
+    content: htmlToText(html),
+    sourceType: 'url',
+    metadata: {
+      url,
+      fetchedAt: Date.now(), // epoch milliseconds
+    },
+  };
+}
+
+/**
+ * GETs `url` and resolves with the body decoded as UTF-8. Follows up to `redirectsLeft`
+ * redirects; rejects on HTTP status >= 400, 30 s of socket inactivity, or a network error.
+ */
+function httpGet(url: string, redirectsLeft = 5): Promise<string> {
+  return new Promise((resolve, reject) => {
+    // Pick the client from the parsed scheme so an upper-case `HTTPS://` still uses TLS.
+    const client = new URL(url).protocol === 'https:' ? https : http;
+    const req = client.get(url, {
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (compatible; Cortex/1.0)',
+        'Accept': 'text/html,application/xhtml+xml,text/plain,*/*',
+      },
+      timeout: 30000,
+    }, (res) => {
+      const status = res.statusCode ?? 0;
+      if (status >= 300 && status < 400 && res.headers.location) {
+        res.resume(); // discard the unread body so the socket is released
+        if (redirectsLeft <= 0) return reject(new Error('Too many redirects'));
+        return resolve(httpGet(new URL(res.headers.location, url).toString(), redirectsLeft - 1));
+      }
+      if (status >= 400) { res.resume(); return reject(new Error(`HTTP ${status}`)); }
+      // Decode inside the stream: appending raw Buffers corrupts characters split across chunks.
+      res.setEncoding('utf8');
+      let data = '';
+      res.on('data', (chunk: string) => { data += chunk; });
+      res.on('end', () => resolve(data));
+      res.on('error', reject);
+    });
+    req.on('error', reject);
+    req.on('timeout', () => req.destroy(new Error('Request timeout'))); // reaches reject via 'error'
+  });
+}
+
+function htmlToText(html: string): string {
+ // Remove scripts, styles, and other non-content elements
+ let text = html
+ .replace(/