finalize M06-M12 with Go/SQLite/Synology NAS implementation decisions
This commit is contained in:
@@ -1,8 +1,54 @@
|
||||
# Milestone 8: Telemetry System
|
||||
|
||||
**Status**: Planning
|
||||
**Status**: Decided
|
||||
**Goal**: Collect app usage analytics and crash reports while respecting privacy.
|
||||
|
||||
## Decision
|
||||
|
||||
**SQLite with background aggregation** for self-hosted Synology NAS:
|
||||
|
||||
```
|
||||
Storage: SQLite (separate telemetry.db to isolate write load)
|
||||
Aggregation: Go background goroutine (hourly/daily rollups)
|
||||
Retention: Raw events 7 days, aggregates indefinitely
|
||||
Privacy: Hashed device IDs, no PII, opt-out available
|
||||
```
|
||||
|
||||
### Rationale
|
||||
|
||||
1. **Simple** - No separate time-series database needed
|
||||
2. **SQLite scales** - Can handle thousands of events/day easily
|
||||
3. **Background jobs** - Go goroutines for aggregation, cleanup
|
||||
4. **Separate DB** - Telemetry writes don't affect main portal.db
|
||||
5. **Privacy-first** - Minimal collection, hashed IDs
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ mosis-portal container │
|
||||
│ ┌────────────────────────────────────────────────────────────┐ │
|
||||
│ │ Go Binary │ │
|
||||
│ │ ┌─────────────┐ ┌────────────────┐ │ │
|
||||
│ │ │ API Handler │───►│ Telemetry Svc │ │ │
|
||||
│ │ │ POST /v1/ │ │ - Buffer events│ │ │
|
||||
│ │ │ telemetry/* │ │ - Batch insert │ │ │
|
||||
│ │ └─────────────┘ └───────┬────────┘ │ │
|
||||
│ │ │ │ │
|
||||
│ │ ┌─────────────────────────▼────────────────────────────┐ │ │
|
||||
│ │ │ Background Workers │ │ │
|
||||
│ │ │ • Hourly aggregation (event counts, unique devices) │ │ │
|
||||
│ │ │ • Daily cleanup (delete raw events > 7 days) │ │ │
|
||||
│ │ │ • Crash grouping (fingerprint + dedup) │ │ │
|
||||
│ │ └───────────────────────────────────────────────────────┘ │ │
|
||||
│ └──────────────────────────────┬─────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ /volume1/mosis/data/ │ │
|
||||
│ ├── portal.db (main) │ │
|
||||
│ └── telemetry.db ◄────────────┘ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
@@ -173,61 +219,107 @@ end
|
||||
|
||||
---
|
||||
|
||||
## Storage Options
|
||||
## Storage (SQLite)
|
||||
|
||||
### Option A: PostgreSQL + TimescaleDB
|
||||
### Telemetry Database Schema
|
||||
|
||||
```sql
|
||||
-- Hypertable for time-series data
|
||||
CREATE TABLE telemetry_events (
|
||||
time TIMESTAMPTZ NOT NULL,
|
||||
app_id TEXT NOT NULL,
|
||||
device_id TEXT NOT NULL,
|
||||
session_id TEXT,
|
||||
event_type TEXT NOT NULL,
|
||||
event_data JSONB,
|
||||
-- telemetry.db (separate from portal.db)
|
||||
|
||||
-- Raw events (7-day retention)
|
||||
CREATE TABLE events (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
app_id TEXT NOT NULL,
|
||||
device_id TEXT NOT NULL, -- SHA256 hashed
|
||||
session_id TEXT,
|
||||
event_type TEXT NOT NULL,
|
||||
event_data TEXT, -- JSON string
|
||||
app_version TEXT,
|
||||
mosis_version TEXT
|
||||
mosis_version TEXT,
|
||||
timestamp TEXT NOT NULL -- ISO8601
|
||||
);
|
||||
|
||||
SELECT create_hypertable('telemetry_events', 'time');
|
||||
CREATE INDEX idx_events_app_time ON events(app_id, timestamp);
|
||||
CREATE INDEX idx_events_type ON events(event_type, timestamp);
|
||||
|
||||
-- Continuous aggregate for daily stats
|
||||
CREATE MATERIALIZED VIEW daily_stats
|
||||
WITH (timescaledb.continuous) AS
|
||||
SELECT
|
||||
time_bucket('1 day', time) AS day,
|
||||
app_id,
|
||||
event_type,
|
||||
COUNT(*) as count,
|
||||
COUNT(DISTINCT device_id) as unique_devices
|
||||
FROM telemetry_events
|
||||
GROUP BY day, app_id, event_type;
|
||||
-- Hourly rollups, written by the background aggregation job.
-- One row per (app_id, hour, event_type).
CREATE TABLE hourly_stats (
    app_id         TEXT    NOT NULL,
    hour           TEXT    NOT NULL,  -- bucket key, formatted YYYY-MM-DDTHH
    event_type     TEXT    NOT NULL,
    count          INTEGER NOT NULL,
    unique_devices INTEGER NOT NULL,
    CONSTRAINT hourly_stats_pk PRIMARY KEY (app_id, hour, event_type)
);
|
||||
|
||||
-- Daily rollups (computed from hourly).
-- One row per (app_id, date, event_type).
-- NOTE(review): count can be summed from hourly rows, but unique_devices is
-- not additive (the same device may appear in several hours of one day), so
-- it presumably has to be recomputed from raw events — confirm in the
-- daily aggregation job.
CREATE TABLE daily_stats (
    app_id         TEXT    NOT NULL,
    date           TEXT    NOT NULL,  -- bucket key, formatted YYYY-MM-DD
    event_type     TEXT    NOT NULL,
    count          INTEGER NOT NULL,
    unique_devices INTEGER NOT NULL,
    CONSTRAINT daily_stats_pk PRIMARY KEY (app_id, date, event_type)
);
|
||||
|
||||
-- Crash groups: one row per distinct crash, deduplicated by
-- (app_id, fingerprint).
CREATE TABLE crash_groups (
    -- NOT NULL is spelled out: in an ordinary (rowid) SQLite table a
    -- non-INTEGER PRIMARY KEY column accepts NULL unless declared NOT NULL.
    id                 TEXT    PRIMARY KEY NOT NULL,
    app_id             TEXT    NOT NULL,
    fingerprint        TEXT    NOT NULL,
    crash_type         TEXT    NOT NULL,
    message            TEXT,
    sample_stack_trace TEXT,
    first_seen         TEXT    NOT NULL,
    last_seen          TEXT    NOT NULL,
    -- Defaults alone do not stop an explicit NULL; NOT NULL keeps counters
    -- and status filters well-defined.
    occurrence_count   INTEGER NOT NULL DEFAULT 1,
    affected_versions  TEXT,                         -- JSON array
    status             TEXT    NOT NULL DEFAULT 'open',
    UNIQUE (app_id, fingerprint)
);

-- Index on (app_id, status) for per-app lookups filtered by status.
CREATE INDEX idx_crashes_app ON crash_groups(app_id, status);
|
||||
```
|
||||
|
||||
### Option B: ClickHouse
|
||||
### Go Background Workers
|
||||
|
||||
```sql
|
||||
CREATE TABLE telemetry_events (
|
||||
timestamp DateTime,
|
||||
app_id String,
|
||||
device_id String,
|
||||
session_id String,
|
||||
event_type String,
|
||||
event_data String, -- JSON
|
||||
app_version String,
|
||||
mosis_version String
|
||||
) ENGINE = MergeTree()
|
||||
PARTITION BY toYYYYMM(timestamp)
|
||||
ORDER BY (app_id, timestamp);
|
||||
```
|
||||
```go
|
||||
// Start background workers
|
||||
func (s *TelemetryService) StartWorkers(ctx context.Context) {
|
||||
// Hourly aggregation
|
||||
go s.runPeriodic(ctx, time.Hour, s.aggregateHourly)
|
||||
|
||||
### Option C: Custom + PostgreSQL
|
||||
// Daily aggregation (run at 2am)
|
||||
go s.runDaily(ctx, 2, s.aggregateDaily)
|
||||
|
||||
```
|
||||
Raw events → Write to append-only log
|
||||
Aggregator → Process hourly → Write to PostgreSQL
|
||||
Cleanup → Delete raw after 24h
|
||||
// Cleanup old events (run at 3am)
|
||||
go s.runDaily(ctx, 3, s.cleanupOldEvents)
|
||||
}
|
||||
|
||||
func (s *TelemetryService) aggregateHourly(ctx context.Context) error {
|
||||
hour := time.Now().Add(-time.Hour).Format("2006-01-02T15")
|
||||
|
||||
_, err := s.db.ExecContext(ctx, `
|
||||
INSERT OR REPLACE INTO hourly_stats (app_id, hour, event_type, count, unique_devices)
|
||||
SELECT
|
||||
app_id,
|
||||
strftime('%Y-%m-%dT%H', timestamp) as hour,
|
||||
event_type,
|
||||
COUNT(*) as count,
|
||||
COUNT(DISTINCT device_id) as unique_devices
|
||||
FROM events
|
||||
WHERE strftime('%Y-%m-%dT%H', timestamp) = ?
|
||||
GROUP BY app_id, hour, event_type
|
||||
`, hour)
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *TelemetryService) cleanupOldEvents(ctx context.Context) error {
|
||||
cutoff := time.Now().AddDate(0, 0, -7).Format(time.RFC3339)
|
||||
_, err := s.db.ExecContext(ctx,
|
||||
"DELETE FROM events WHERE timestamp < ?", cutoff)
|
||||
return err
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
@@ -474,25 +566,26 @@ DELETE /v1/privacy/data:
|
||||
|
||||
## Deliverables
|
||||
|
||||
- [x] Storage approach decided (SQLite with separate telemetry.db)
|
||||
- [ ] Event schema specification
|
||||
- [ ] Client-side SDK for batching
|
||||
- [ ] Ingestion API endpoints
|
||||
- [ ] Storage setup (TimescaleDB or ClickHouse)
|
||||
- [ ] Aggregation jobs
|
||||
- [ ] Client-side batching (Lua TelemetryManager)
|
||||
- [ ] Ingestion API endpoints (Go + Chi)
|
||||
- [ ] SQLite schema and migrations
|
||||
- [ ] Background aggregation workers (Go goroutines)
|
||||
- [ ] Crash grouping logic
|
||||
- [ ] Developer dashboard
|
||||
- [ ] Privacy controls
|
||||
- [ ] Data retention automation
|
||||
- [ ] GDPR export/delete
|
||||
- [ ] Developer analytics dashboard (htmx)
|
||||
- [ ] Privacy controls (opt-out in manifest)
|
||||
- [ ] Data retention cleanup job
|
||||
- [ ] GDPR export/delete endpoints
|
||||
|
||||
---
|
||||
|
||||
## Open Questions
|
||||
|
||||
1. Real-time crash alerts (email/Slack)?
|
||||
2. Sampling for high-volume apps?
|
||||
3. Custom events API for developers?
|
||||
4. Benchmarks/comparisons with similar apps?
|
||||
1. Real-time crash alerts? → Consider email notifications for v1.1
|
||||
2. ~~Sampling for high-volume apps?~~ → Not needed for self-hosted scale
|
||||
3. ~~Custom events API for developers?~~ → Yes, via manifest opt-in
|
||||
4. ~~Benchmarks/comparisons with similar apps?~~ → Defer to post-MVP
|
||||
|
||||
---
|
||||
|
||||
|
||||
Reference in New Issue
Block a user