add developer portal planning documentation (M01-M12)
This commit is contained in:
503
DEV_PORTAL_M08_TELEMETRY.md
Normal file
503
DEV_PORTAL_M08_TELEMETRY.md
Normal file
@@ -0,0 +1,503 @@
|
||||
# Milestone 8: Telemetry System
|
||||
|
||||
**Status**: Planning
|
||||
**Goal**: Collect app usage analytics and crash reports while respecting privacy.
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Telemetry provides developers with insights into app usage, performance, and crashes. The system must balance the usefulness of this data against user privacy.
|
||||
|
||||
---
|
||||
|
||||
## Privacy Principles
|
||||
|
||||
1. **Minimal collection** - Only what's necessary
|
||||
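2. **No direct PII by default** - Device IDs are pseudonymized (salted and hashed per app); they still count as personal data for export/delete requests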
|
||||
3. **Transparency** - Users know what's collected
|
||||
4. **Opt-out available** - Users can disable
|
||||
5. **Data retention limits** - Auto-delete old data
|
||||
6. **GDPR compliance** - Export/delete on request
|
||||
|
||||
---
|
||||
|
||||
## Event Types
|
||||
|
||||
### Automatic Events (Default)
|
||||
|
||||
| Event | Description | Data |
|
||||
|-------|-------------|------|
|
||||
| `app_start` | App launched | version, mosis_version |
|
||||
| `app_stop` | App closed | duration_seconds |
|
||||
| `app_crash` | Unhandled error | crash_type, message |
|
||||
| `lua_error` | Lua runtime error | message, stack (no user data) |
|
||||
|
||||
### Performance Events (Default)
|
||||
|
||||
| Event | Description | Data |
|
||||
|-------|-------------|------|
|
||||
| `perf_frame` | Frame time (sampled) | avg_ms, p95_ms |
|
||||
| `perf_memory` | Memory usage | used_mb, limit_mb |
|
||||
| `perf_startup` | Startup time | duration_ms |
|
||||
|
||||
### Usage Events (Opt-in)
|
||||
|
||||
| Event | Description | Data |
|
||||
|-------|-------------|------|
|
||||
| `screen_view` | Screen navigation | screen_name |
|
||||
| `button_click` | UI interaction | element_id |
|
||||
| `feature_used` | Feature usage | feature_name |
|
||||
|
||||
---
|
||||
|
||||
## Data Schema
|
||||
|
||||
### Event Payload
|
||||
|
||||
```json
|
||||
{
|
||||
"app_id": "com.developer.myapp",
|
||||
"app_version": "1.2.0",
|
||||
"mosis_version": "1.0.0",
|
||||
"device_id": "sha256_hashed_id",
|
||||
"session_id": "uuid",
|
||||
"events": [
|
||||
{
|
||||
"type": "app_start",
|
||||
"timestamp": "2024-01-15T10:30:00Z",
|
||||
"data": {}
|
||||
},
|
||||
{
|
||||
"type": "screen_view",
|
||||
"timestamp": "2024-01-15T10:30:05Z",
|
||||
"data": {
|
||||
"screen_name": "home"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Crash Report Payload
|
||||
|
||||
```json
|
||||
{
|
||||
"app_id": "com.developer.myapp",
|
||||
"app_version": "1.2.0",
|
||||
"mosis_version": "1.0.0",
|
||||
"device_id": "sha256_hashed_id",
|
||||
"timestamp": "2024-01-15T10:35:00Z",
|
||||
"crash": {
|
||||
"type": "lua_error",
|
||||
"message": "attempt to index nil value 'user'",
|
||||
"stack_trace": "main.lua:42: in function 'loadUser'\nmain.lua:15: in main chunk",
|
||||
"context": {
|
||||
"screen": "profile.rml",
|
||||
"memory_mb": 45,
|
||||
"uptime_seconds": 300
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Device ID Hashing
|
||||
|
||||
```lua
|
||||
-- On device: build the per-app identifier that is sent instead of the raw ID.
local raw_id = get_android_id() -- or similar
-- The fixed salt text and the app ID are appended to the raw ID before hashing.
local salted_input = raw_id .. "mosis_salt_" .. app_id
local hashed = sha256(salted_input)
-- Result: "a3f2b1c4d5e6..."

-- Cannot reverse to original device ID
-- Different per app (can't track across apps)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Collection Architecture
|
||||
|
||||
```
|
||||
┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐
|
||||
│ Device │────►│ Batch │────►│ API │────►│ Storage │
|
||||
│ │ │ Queue │ │ │ │ │
|
||||
└──────────┘ └──────────┘ └──────────┘ └──────────┘
|
||||
│
|
||||
│ Every 60s or
|
||||
│ on app close
|
||||
▼
|
||||
┌──────────┐
|
||||
│ Upload │
|
||||
└──────────┘
|
||||
```
|
||||
|
||||
### Client-Side Batching
|
||||
|
||||
```lua
|
||||
-- TelemetryManager on device.
-- Buffers events in memory and uploads them in batches. `track` and `flush`
-- are kept as globals so existing callers continue to work.
local events = {}
local last_flush = os.time()

-- Batching thresholds: upload once this many events are buffered, or once
-- more than this many seconds have passed since the previous upload.
local MAX_BATCH_SIZE = 50
local FLUSH_INTERVAL_SECONDS = 60

--- Record one telemetry event.
-- Does nothing when telemetry is disabled.
-- @param event_type string  event name, e.g. "app_start"
-- @param data table|nil     event-specific fields; defaults to an empty table
function track(event_type, data)
  if not telemetry_enabled then return end

  events[#events + 1] = {
    type = event_type,
    timestamp = os.date("!%Y-%m-%dT%H:%M:%SZ"), -- "!" formats the time in UTC
    -- NOTE(review): some JSON encoders emit an empty Lua table as [] rather
    -- than {}; confirm the encoder matches the documented "data": {} shape.
    data = data or {}
  }

  -- Flush if batch is large or time elapsed.
  -- NOTE(review): the time check only runs when an event is tracked; a timer
  -- and an app-close hook are still needed to honor "every 60s or on close".
  if #events >= MAX_BATCH_SIZE
      or (os.time() - last_flush) > FLUSH_INTERVAL_SECONDS then
    flush()
  end
end

--- Upload all buffered events as a single batch and reset the buffer.
-- Does nothing when the buffer is empty. The payload carries every envelope
-- field of the documented event payload; MOSIS_VERSION and SESSION_ID are
-- expected to be provided by the host alongside APP_ID and APP_VERSION.
function flush()
  if #events == 0 then return end

  local payload = {
    app_id = APP_ID,
    app_version = APP_VERSION,
    mosis_version = MOSIS_VERSION,
    device_id = HASHED_DEVICE_ID,
    session_id = SESSION_ID,
    events = events
  }

  -- Async HTTP POST
  -- NOTE(review): the buffer is cleared without waiting for the upload
  -- result, so a failed request loses the batch; add retry/re-queue once the
  -- http API's completion/error reporting is known.
  http.post(TELEMETRY_URL, json.encode(payload))

  events = {}
  last_flush = os.time()
end
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Storage Options
|
||||
|
||||
### Option A: PostgreSQL + TimescaleDB
|
||||
|
||||
```sql
|
||||
-- Hypertable for time-series data
|
||||
CREATE TABLE telemetry_events (
|
||||
time TIMESTAMPTZ NOT NULL,
|
||||
app_id TEXT NOT NULL,
|
||||
device_id TEXT NOT NULL,
|
||||
session_id TEXT,
|
||||
event_type TEXT NOT NULL,
|
||||
event_data JSONB,
|
||||
app_version TEXT,
|
||||
mosis_version TEXT
|
||||
);
|
||||
|
||||
SELECT create_hypertable('telemetry_events', 'time');
|
||||
|
||||
-- Continuous aggregate for daily stats
|
||||
CREATE MATERIALIZED VIEW daily_stats
|
||||
WITH (timescaledb.continuous) AS
|
||||
SELECT
|
||||
time_bucket('1 day', time) AS day,
|
||||
app_id,
|
||||
event_type,
|
||||
COUNT(*) as count,
|
||||
COUNT(DISTINCT device_id) as unique_devices
|
||||
FROM telemetry_events
|
||||
GROUP BY day, app_id, event_type;
|
||||
```
|
||||
|
||||
### Option B: ClickHouse
|
||||
|
||||
```sql
|
||||
CREATE TABLE telemetry_events (
|
||||
timestamp DateTime,
|
||||
app_id String,
|
||||
device_id String,
|
||||
session_id String,
|
||||
event_type String,
|
||||
event_data String, -- JSON
|
||||
app_version String,
|
||||
mosis_version String
|
||||
) ENGINE = MergeTree()
|
||||
PARTITION BY toYYYYMM(timestamp)
|
||||
ORDER BY (app_id, timestamp);
|
||||
```
|
||||
|
||||
### Option C: Custom + PostgreSQL
|
||||
|
||||
```
|
||||
Raw events → Write to append-only log
|
||||
Aggregator → Process hourly → Write to PostgreSQL
|
||||
Cleanup → Delete raw after 24h
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Aggregation
|
||||
|
||||
### Pre-computed Metrics
|
||||
|
||||
| Metric | Granularity | Retention |
|
||||
|--------|-------------|-----------|
|
||||
| Daily active users | Day | 2 years |
|
||||
| Event counts | Day | 1 year |
|
||||
| Crash counts | Day | 1 year |
|
||||
| Session duration | Day | 90 days |
|
||||
| Performance percentiles | Day | 90 days |
|
||||
|
||||
### Aggregation Queries
|
||||
|
||||
```sql
|
||||
-- Daily active users
|
||||
SELECT
|
||||
DATE_TRUNC('day', time) as day,
|
||||
COUNT(DISTINCT device_id) as dau
|
||||
FROM telemetry_events
|
||||
WHERE app_id = $1
|
||||
AND event_type = 'app_start'
|
||||
AND time > NOW() - INTERVAL '30 days'
|
||||
GROUP BY day
|
||||
ORDER BY day;
|
||||
|
||||
-- Crash rate by version
|
||||
SELECT
|
||||
app_version,
|
||||
COUNT(*) FILTER (WHERE event_type = 'app_crash') as crashes,
|
||||
COUNT(*) FILTER (WHERE event_type = 'app_start') as starts,
|
||||
ROUND(
|
||||
100.0 * COUNT(*) FILTER (WHERE event_type = 'app_crash') /
|
||||
NULLIF(COUNT(*) FILTER (WHERE event_type = 'app_start'), 0),
|
||||
2
|
||||
) as crash_rate
|
||||
FROM telemetry_events
|
||||
WHERE app_id = $1
|
||||
AND time > NOW() - INTERVAL '7 days'
|
||||
GROUP BY app_version;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Crash Grouping
|
||||
|
||||
### Stack Trace Fingerprinting
|
||||
|
||||
```go
|
||||
func fingerprintCrash(crash CrashReport) string {
|
||||
// Normalize stack trace
|
||||
normalized := normalizeStackTrace(crash.StackTrace)
|
||||
|
||||
// Hash key components
|
||||
key := fmt.Sprintf("%s:%s:%s",
|
||||
crash.CrashType,
|
||||
crash.Message,
|
||||
normalized,
|
||||
)
|
||||
|
||||
return sha256(key)[:16]
|
||||
}
|
||||
|
||||
// Patterns for the volatile parts of a stack trace. They are compiled once at
// package initialization instead of on every call.
var (
	stackLineNumberRe = regexp.MustCompile(`:\d+:`)
	stackAddressRe    = regexp.MustCompile(`0x[0-9a-fA-F]+`)
)

// normalizeStackTrace strips the parts of a stack trace that differ between
// otherwise identical crashes, keeping function names and file names:
//   - line numbers (":42:" becomes ":?:"), which change with code updates
//   - memory addresses ("0x55d0c8a3f2b0" becomes "0x?")
func normalizeStackTrace(stack string) string {
	stack = stackLineNumberRe.ReplaceAllString(stack, ":?:")
	return stackAddressRe.ReplaceAllString(stack, "0x?")
}
|
||||
```
|
||||
|
||||
### Crash Groups Table
|
||||
|
||||
```sql
|
||||
CREATE TABLE crash_groups (
|
||||
id UUID PRIMARY KEY,
|
||||
app_id TEXT NOT NULL,
|
||||
fingerprint TEXT NOT NULL,
|
||||
crash_type TEXT NOT NULL,
|
||||
message TEXT,
|
||||
sample_stack_trace TEXT,
|
||||
first_seen TIMESTAMPTZ NOT NULL,
|
||||
last_seen TIMESTAMPTZ NOT NULL,
|
||||
occurrence_count INT DEFAULT 1,
|
||||
affected_versions TEXT[],
|
||||
status TEXT DEFAULT 'open', -- open, resolved, ignored
|
||||
UNIQUE(app_id, fingerprint)
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Developer Dashboard
|
||||
|
||||
### Metrics View
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Analytics - My Calculator │
|
||||
├─────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Date Range: [Last 30 days ▼] │
|
||||
│ │
|
||||
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ Daily Users │ │ Crashes │ │ Crash-free │ │
|
||||
│ │ 1,234 │ │ 23 │ │ 98.1% │ │
|
||||
│ │ ▲ +12% │ │ ▼ -45% │ │ ▲ +2% │ │
|
||||
│ └─────────────┘ └─────────────┘ └─────────────┘ │
|
||||
│ │
|
||||
│ ┌────────────────────────────────────────────────────┐ │
|
||||
│ │ Daily Active Users │ │
|
||||
│ │ [Line chart showing DAU over time] │ │
|
||||
│ └────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
│ ┌────────────────────────────────────────────────────┐ │
|
||||
│ │ Version Distribution │ │
|
||||
│ │ [Pie chart: v1.2.0: 60%, v1.1.0: 30%, v1.0.0: 10%]│ │
|
||||
│ └────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Crashes View
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ Crashes - My Calculator │
|
||||
├─────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ Filter: [All versions ▼] [Open ▼] │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────────────────┐ │
|
||||
│ │ ● attempt to index nil value 'user' │ │
|
||||
│ │ lua_error • 156 occurrences • v1.2.0 │ │
|
||||
│ │ First: Jan 10 • Last: Jan 15 │ │
|
||||
│ │ [View] │ │
|
||||
│ ├──────────────────────────────────────────────────────┤ │
|
||||
│ │ ● memory limit exceeded │ │
|
||||
│ │ sandbox_error • 23 occurrences • v1.1.0, v1.2.0 │ │
|
||||
│ │ First: Jan 5 • Last: Jan 14 │ │
|
||||
│ │ [View] │ │
|
||||
│ └──────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## API Endpoints
|
||||
|
||||
```yaml
|
||||
# Ingestion (from devices)
|
||||
POST /v1/telemetry/events:
|
||||
auth: device_token or api_key
|
||||
body: { app_id, app_version, mosis_version, device_id, session_id, events[] }
|
||||
response: { received: number }
|
||||
|
||||
POST /v1/telemetry/crash:
|
||||
auth: device_token or api_key
|
||||
body: { app_id, app_version, mosis_version, device_id, timestamp, crash }
|
||||
response: { id: string }
|
||||
|
||||
# Dashboard (for developers)
|
||||
GET /v1/apps/:id/analytics/overview:
|
||||
auth: required
|
||||
query: { start_date, end_date }
|
||||
response: { dau, crashes, crash_free_rate, ... }
|
||||
|
||||
GET /v1/apps/:id/analytics/events:
|
||||
auth: required
|
||||
query: { start_date, end_date, event_type }
|
||||
response: { data: [{ date, count, unique_devices }] }
|
||||
|
||||
GET /v1/apps/:id/crashes:
|
||||
auth: required
|
||||
query: { version, status, page, limit }
|
||||
response: { crashes: CrashGroup[], total }
|
||||
|
||||
GET /v1/apps/:id/crashes/:fingerprint:
|
||||
auth: required
|
||||
response: { crash_group, recent_occurrences[] }
|
||||
|
||||
PATCH /v1/apps/:id/crashes/:fingerprint:
|
||||
auth: required
|
||||
body: { status: 'resolved' | 'ignored' }
|
||||
response: { crash_group }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Data Retention
|
||||
|
||||
| Data Type | Retention | Reason |
|
||||
|-----------|-----------|--------|
|
||||
| Raw events | 7 days | Debugging |
|
||||
| Daily aggregates | 90 days to 2 years, per metric (see Pre-computed Metrics) | Trends |
|
||||
| Crash reports | 90 days | Investigation |
|
||||
| Crash groups | Forever | Issue tracking |
|
||||
|
||||
### Cleanup Job
|
||||
|
||||
```sql
|
||||
-- Run daily
|
||||
DELETE FROM telemetry_events
|
||||
WHERE time < NOW() - INTERVAL '7 days';
|
||||
|
||||
DELETE FROM crash_reports
|
||||
WHERE timestamp < NOW() - INTERVAL '90 days';
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Privacy Controls
|
||||
|
||||
### User Settings
|
||||
|
||||
```
|
||||
Settings > Privacy > Analytics
|
||||
├── [✓] Send crash reports (helps developers fix bugs)
|
||||
├── [ ] Send usage analytics (how you use apps)
|
||||
└── [Request Data Deletion]
|
||||
```
|
||||
|
||||
### GDPR Endpoints
|
||||
|
||||
```yaml
|
||||
# User requests their data
|
||||
GET /v1/privacy/export:
|
||||
auth: user_token
|
||||
response: { download_url } # JSON export of all data
|
||||
|
||||
# User requests deletion
|
||||
DELETE /v1/privacy/data:
|
||||
auth: user_token
|
||||
response: { status: 'scheduled' } # Delete within 30 days
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Deliverables
|
||||
|
||||
- [ ] Event schema specification
|
||||
- [ ] Client-side SDK for batching
|
||||
- [ ] Ingestion API endpoints
|
||||
- [ ] Storage setup (TimescaleDB or ClickHouse)
|
||||
- [ ] Aggregation jobs
|
||||
- [ ] Crash grouping logic
|
||||
- [ ] Developer dashboard
|
||||
- [ ] Privacy controls
|
||||
- [ ] Data retention automation
|
||||
- [ ] GDPR export/delete
|
||||
|
||||
---
|
||||
|
||||
## Open Questions
|
||||
|
||||
1. Real-time crash alerts (email/Slack)?
|
||||
2. Sampling for high-volume apps?
|
||||
3. Custom events API for developers?
|
||||
4. Benchmarks/comparisons with similar apps?
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
|
||||
- [GDPR Requirements](https://gdpr.eu/)
|
||||
- [TimescaleDB Best Practices](https://docs.timescale.com/timescaledb/latest/)
|
||||
- [Sentry Crash Grouping](https://docs.sentry.io/product/data-management-settings/event-grouping/)
|
||||
Reference in New Issue
Block a user