Add development plan with 13 milestone specifications
- docs/plan.md: Master roadmap with phases and priorities - docs/milestones/01-13: Detailed specs for each feature - Updated CLAUDE.md with plan references and build commands Milestones cover: - Phase 1: Temporal versioning, auto-capture, context injection, codebase indexing - Phase 2: Daily journal, content ingestion, graph visualization, import/export - Phase 3: Multi-graph, smart retrieval, TUI dashboard, browser extension, shell completions
This commit is contained in:
264
docs/milestones/06-content-ingestion.md
Normal file
264
docs/milestones/06-content-ingestion.md
Normal file
@@ -0,0 +1,264 @@
|
||||
# Milestone 6: URL & Content Ingestion
|
||||
|
||||
## Overview
|
||||
|
||||
Ingest content from URLs, PDFs, and documents into the knowledge graph. Automatically chunk, summarize, and link to existing knowledge.
|
||||
|
||||
## Motivation
|
||||
|
||||
- Knowledge exists outside the codebase (docs, articles, specs)
|
||||
- Manual copy-paste is tedious and loses structure
|
||||
- Supermemory's multi-source ingestion is a key feature
|
||||
- Research and documentation should be first-class
|
||||
|
||||
## Features
|
||||
|
||||
### 6.1 URL Ingestion
|
||||
|
||||
```bash
|
||||
# Ingest a webpage
|
||||
cortex ingest https://docs.example.com/api
|
||||
|
||||
# Ingest with custom title
|
||||
cortex ingest https://... --title "API Documentation"
|
||||
|
||||
# Ingest and tag
|
||||
cortex ingest https://... --tags docs,api,reference
|
||||
```
|
||||
|
||||
### 6.2 PDF Ingestion
|
||||
|
||||
```bash
|
||||
# Ingest a PDF
|
||||
cortex ingest ./spec.pdf
|
||||
|
||||
# Ingest specific pages
|
||||
cortex ingest ./spec.pdf --pages 1-10
|
||||
|
||||
# Ingest with chunking strategy
|
||||
cortex ingest ./spec.pdf --chunk-size 1000
|
||||
```
|
||||
|
||||
### 6.3 Markdown/Text Ingestion
|
||||
|
||||
```bash
|
||||
# Ingest markdown file
|
||||
cortex ingest ./notes.md
|
||||
|
||||
# Ingest from stdin
|
||||
cat notes.txt | cortex ingest --stdin
|
||||
|
||||
# Ingest clipboard
|
||||
cortex ingest --clipboard
|
||||
```
|
||||
|
||||
### 6.4 Smart Chunking
|
||||
|
||||
Large documents are split intelligently:
|
||||
|
||||
```typescript
|
||||
/** Controls how a large document is split into chunks before ingestion. */
interface ChunkStrategy {
  /** Max tokens per chunk. */
  maxTokens: number;
  /** Overlap between consecutive chunks (presumably in tokens — TODO confirm unit). */
  overlap: number;
  /** Boundary on which the document is split. */
  splitOn: 'paragraph' | 'sentence' | 'heading' | 'page';
  /** NOTE(review): presumably keeps headings/lists intact inside a chunk — confirm intended semantics. */
  preserveStructure: boolean;
}
|
||||
```
|
||||
|
||||
### 6.5 Entity Extraction
|
||||
|
||||
Extract and link entities:
|
||||
|
||||
```typescript
|
||||
/** Entities pulled out of ingested content, grouped by category. */
interface ExtractedEntities {
  /** Names of people mentioned in the content. */
  people: string[];
  /** Names of organizations mentioned in the content. */
  organizations: string[];
  /** Technologies mentioned in the content. */
  technologies: string[];
  /** Concepts mentioned in the content. */
  concepts: string[];
}
|
||||
|
||||
// Auto-link to existing nodes with matching titles/tags
|
||||
```
|
||||
|
||||
### 6.6 Source Tracking
|
||||
|
||||
Track where content came from:
|
||||
|
||||
```typescript
|
||||
metadata: {
|
||||
source: {
|
||||
type: 'url' | 'pdf' | 'file' | 'stdin' | 'clipboard';
|
||||
url?: string;
|
||||
filePath?: string;
|
||||
ingestedAt: number;
|
||||
checksum: string; // For deduplication
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Implementation
|
||||
|
||||
### Ingestion Pipeline
|
||||
|
||||
```typescript
|
||||
// src/core/ingest/index.ts
|
||||
/**
 * Ingests a single source into the knowledge graph.
 *
 * Pipeline: detect the source type, fetch the raw content, convert it to
 * markdown, split it into chunks, create graph nodes, then run entity
 * extraction followed by related-node linking over every created node.
 *
 * A single chunk becomes one standalone node. Several chunks become one
 * parent node plus one child node per chunk, each attached to the parent
 * with a `contains` edge.
 *
 * @param source - The source string passed to `detectSourceType` and the fetcher.
 * @param options - Ingestion options; `options.chunkStrategy` is forwarded to `chunkContent`.
 * @returns The number of nodes created and the detected source type.
 */
export async function ingest(source: string, options: IngestOptions): Promise<IngestResult> {
  // Detect source type
  const sourceType = detectSourceType(source);

  // Fetch/read content
  const rawContent = await fetchContent(source, sourceType);

  // Convert to markdown
  const markdown = await convertToMarkdown(rawContent, sourceType);

  // Chunk if needed
  const chunks = chunkContent(markdown, options.chunkStrategy);

  const [firstChunk] = chunks;
  if (firstChunk === undefined) {
    // Empty content: without this guard the multi-chunk branch would create
    // a parent node with no children.
    return { nodes: 0, source: sourceType };
  }

  // Create nodes
  const nodes: Node[] = [];

  if (chunks.length === 1) {
    // Single node
    nodes.push(await createIngestNode(firstChunk, source, options));
  } else {
    // Parent + children
    const parent = await createParentNode(source, chunks, options);
    nodes.push(parent);

    for (const chunk of chunks) {
      const child = await createChunkNode(chunk, parent.id, options);
      nodes.push(child);
      addEdge(parent.id, child.id, 'contains');
    }
  }

  // Extract and link entities for every node before any related-node linking
  // runs, matching the original two-pass order.
  for (const node of nodes) {
    await extractAndLinkEntities(node);
  }

  // Find and link related nodes
  // NOTE(review): the CLI documents `--no-link`, but nothing here consults an
  // option to skip linking — confirm the option name on IngestOptions and gate
  // these passes on it.
  for (const node of nodes) {
    await linkRelatedNodes(node);
  }

  return { nodes: nodes.length, source: sourceType };
}
|
||||
```
|
||||
|
||||
### URL Fetcher
|
||||
|
||||
```typescript
|
||||
// src/core/ingest/fetchers/url.ts
|
||||
/**
 * Downloads a web page and extracts its main article content with Readability.
 *
 * @param url - URL of the page to fetch.
 * @returns The article title, plain-text content, and article HTML. When
 *   Readability yields nothing (or an empty value), the title falls back to
 *   `url`, the text to `''`, and the HTML to the full page source.
 * @throws Error when the server responds with a non-2xx status, so that
 *   error pages are never ingested as content.
 */
export async function fetchUrl(url: string): Promise<FetchedContent> {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
  }
  const html = await response.text();

  // Use readability to extract main content. Passing the final (post-redirect)
  // URL lets JSDOM/Readability resolve relative links and image sources.
  const doc = new JSDOM(html, { url: response.url || url });
  const article = new Readability(doc.window.document).parse();

  // `||` rather than `??` is deliberate: empty strings should also fall back.
  return {
    title: article?.title || url,
    content: article?.textContent || '',
    html: article?.content || html,
  };
}
|
||||
```
|
||||
|
||||
### PDF Parser
|
||||
|
||||
```typescript
|
||||
// src/core/ingest/fetchers/pdf.ts
|
||||
/**
 * Reads a PDF from disk and extracts its text.
 *
 * @param filePath - Path of the PDF file to read.
 * @param options - Currently unused.
 *   NOTE(review): the CLI documents `--pages 1-10`, but no page range is
 *   applied here — confirm the shape of PdfOptions and wire it through.
 * @returns The extracted text, the page count, and the PDF info dictionary
 *   as reported by the parser.
 */
export async function parsePdf(filePath: string, options?: PdfOptions): Promise<ParsedPdf> {
  // Use pdf-parse or pdfjs-dist
  // Read asynchronously so a large PDF does not block the event loop.
  const dataBuffer = await fs.promises.readFile(filePath);
  const data = await pdfParse(dataBuffer);

  return {
    text: data.text,
    pages: data.numpages,
    metadata: data.info,
  };
}
|
||||
```
|
||||
|
||||
### Markdown Converter
|
||||
|
||||
```typescript
|
||||
// src/core/ingest/convert.ts
|
||||
/**
 * Converts fetched content to markdown according to its source type.
 *
 * @param content - The fetched content to convert.
 * @param type - The detected source type.
 * @returns Markdown for `'url'` sources (HTML run through turndown), the raw
 *   text for `'pdf'` sources, and `content.content` unchanged for every other
 *   type, including markdown.
 */
export async function convertToMarkdown(content: FetchedContent, type: SourceType): Promise<string> {
  switch (type) {
    case 'url':
      return turndown.turndown(content.html);
    case 'pdf':
      // Already text.
      // NOTE(review): `fetchUrl` builds FetchedContent with `title`, `content`
      // and `html` only; confirm that FetchedContent really carries `text` for
      // PDFs, or have the PDF fetcher populate `content` instead.
      return content.text;
    default:
      // Markdown and any other plain-text source are passed through as-is.
      return content.content;
  }
}
|
||||
```
|
||||
|
||||
## CLI Commands
|
||||
|
||||
| Command | Description |
|
||||
|---------|-------------|
|
||||
| `cortex ingest <source>` | Ingest a URL or local file path |
|
||||
| `cortex ingest --clipboard` | Ingest from clipboard |
|
||||
| `cortex ingest --stdin` | Ingest from stdin |
|
||||
| `cortex ingest --title <title>` | Override title |
|
||||
| `cortex ingest --tags <tags>` | Add tags |
|
||||
| `cortex ingest --chunk-size <n>` | Set chunk size |
|
||||
| `cortex ingest --no-link` | Skip auto-linking |
|
||||
|
||||
## MCP Tools
|
||||
|
||||
```typescript
|
||||
memory_ingest // Ingest URL or content
|
||||
memory_clip // Quick clip from URL
|
||||
```
|
||||
|
||||
## Testing
|
||||
|
||||
- [ ] URL ingestion extracts main content
|
||||
- [ ] PDF parsing handles multi-page docs
|
||||
- [ ] Chunking preserves context
|
||||
- [ ] Entities extracted and linked
|
||||
- [ ] Duplicate content detected
|
||||
- [ ] Source metadata preserved
|
||||
|
||||
## Acceptance Criteria
|
||||
|
||||
- [ ] URLs ingested with readable extraction
|
||||
- [ ] PDFs parsed into searchable text
|
||||
- [ ] Large docs chunked intelligently
|
||||
- [ ] Related nodes auto-linked
|
||||
- [ ] Source tracked for reference
|
||||
- [ ] Deduplication prevents duplicates
|
||||
|
||||
## Estimated Effort
|
||||
|
||||
- URL fetcher + Readability: 4 hours
|
||||
- PDF parser: 4 hours
|
||||
- Chunking strategy: 3 hours
|
||||
- Entity extraction: 4 hours
|
||||
- Auto-linking: 3 hours
|
||||
- CLI commands: 2 hours
|
||||
- Testing: 3 hours
|
||||
- **Total: ~23 hours**
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `@mozilla/readability` for URL content extraction
|
||||
- `pdf-parse` or `pdfjs-dist` for PDFs
|
||||
- `turndown` for HTML→Markdown
|
||||
|
||||
## References
|
||||
|
||||
- [Mozilla Readability](https://github.com/mozilla/readability)
|
||||
- [LangChain document loaders](https://js.langchain.com/docs/modules/data_connection/document_loaders/)
|
||||
Reference in New Issue
Block a user