diff --git a/models-db.md b/models-db.md new file mode 100644 index 0000000..f2a459a --- /dev/null +++ b/models-db.md @@ -0,0 +1,298 @@ +# Models Database Implementation + +SQLite database (`models.db`) for local model metadata storage and CivitAI model information cache. + +## Dual Purpose + +1. **Metadata Storage** — Track local safetensor files with their SHA256 hashes and link them to CivitAI model info +2. **Information Source** — Cache CivitAI API responses for offline queries, search, and model discovery + +## Schema Overview + +### Local Files Tracking + +```sql +-- Your local safetensor files +local_files ( + id INTEGER PRIMARY KEY, + file_path TEXT NOT NULL UNIQUE, -- Absolute path to file + sha256 TEXT NOT NULL, -- Full SHA256 hash + header_size INTEGER, -- Safetensor header size in bytes + tensor_count INTEGER, -- Number of tensors in file + civitai_model_id INTEGER, -- Links to models.civitai_id + civitai_version_id INTEGER, -- Links to model_versions.civitai_id + created_at TEXT, + updated_at TEXT +) + +-- Key-value metadata extracted from safetensor headers +safetensor_metadata ( + id INTEGER PRIMARY KEY, + local_file_id INTEGER NOT NULL, -- FK to local_files.id + key TEXT NOT NULL, + value TEXT, + UNIQUE(local_file_id, key) +) +``` + +### CivitAI Model Cache + +```sql +-- Model creators +creators (id, username, image_url) + +-- Models from CivitAI +models ( + id INTEGER PRIMARY KEY, + civitai_id INTEGER UNIQUE NOT NULL, -- CivitAI model ID + name TEXT NOT NULL, + description TEXT, -- HTML description + type TEXT NOT NULL, -- Checkpoint, LORA, etc. + nsfw INTEGER, + creator_id INTEGER, -- FK to creators + download_count INTEGER, + thumbs_up_count INTEGER, + ... +) + +-- Model versions (each model has multiple versions) +model_versions ( + id INTEGER PRIMARY KEY, + civitai_id INTEGER UNIQUE NOT NULL, -- CivitAI version ID + model_id INTEGER NOT NULL, -- FK to models.id + name TEXT NOT NULL, + base_model TEXT, -- "SD 1.5", "SDXL 1.0", "Pony", etc. + download_url TEXT, + version_index INTEGER, -- 0 = latest + ... +) + +-- Trigger words for LoRAs +trained_words (version_id, word, position) + +-- Downloadable files for each version +version_files ( + civitai_id INTEGER UNIQUE, + version_id INTEGER, -- FK to model_versions + name TEXT, + size_kb REAL, + format TEXT, -- safetensors, ckpt, etc. + fp TEXT, -- fp16, fp32, bf16 + is_primary INTEGER, + download_url TEXT +) + +-- File hashes (SHA256, AutoV1, AutoV2, etc.) +file_hashes (file_id, hash_type, hash_value) +``` + +### Tags and Images + +```sql +tags (id, name) +model_tags (model_id, tag_id) + +-- Preview images with generation params +version_images (version_id, url, width, height, nsfw_level, ...) +image_generation_params (image_id, key, value) -- prompt, sampler, cfg, etc. +image_resources (image_id, name, type, hash, weight) -- LoRAs used in image +``` + +## Views + +```sql +-- Models with their latest version info +v_models_with_latest: + id, civitai_id, name, type, nsfw, creator, latest_version, base_model, download_count, thumbs_up_count + +-- Local files with linked CivitAI info +v_local_files_full: + file_path, sha256, model_name, model_type, version_name, base_model, creator +``` + +## Implementation Strategy + +### 1. Scan Command (`tsr scan`) + +Scan local model directories and populate `local_files`: + +```python +def scan_models(directory: Path, db: Connection) -> None: + """Scan directory for safetensor files and add to database.""" + for path in directory.rglob("*.safetensors"): + sha256 = compute_sha256(path) + header = read_safetensor_header(path) + + # Insert or update local_files + db.execute(""" + INSERT INTO local_files (file_path, sha256, header_size, tensor_count) + VALUES (?, ?, ?, ?) + ON CONFLICT(file_path) DO UPDATE SET + sha256 = excluded.sha256, + updated_at = datetime('now') + """, (str(path), sha256, header['size'], header['tensor_count'])) + + # Store metadata + for key, value in header['metadata'].items(): + db.execute(""" + INSERT INTO safetensor_metadata (local_file_id, key, value) + VALUES (?, ?, ?) + ON CONFLICT DO UPDATE SET value = excluded.value + """, (file_id, key, json.dumps(value))) +``` + +### 2. Link Command (`tsr link`) + +Match local files to CivitAI by hash lookup: + +```python +def link_to_civitai(db: Connection, api_key: str | None) -> None: + """Link local files to CivitAI models using hash matching.""" + unlinked = db.execute(""" + SELECT id, sha256 FROM local_files + WHERE civitai_model_id IS NULL + """).fetchall() + + for file_id, sha256 in unlinked: + # Check local hash cache first + version = db.execute(""" + SELECT mv.civitai_id, mv.model_id + FROM file_hashes fh + JOIN version_files vf ON fh.file_id = vf.id + JOIN model_versions mv ON vf.version_id = mv.id + WHERE fh.hash_value = ? AND fh.hash_type = 'SHA256' + """, (sha256,)).fetchone() + + if not version: + # Fall back to API lookup + data = fetch_civitai_by_hash(sha256, api_key) + if data: + store_model_version(db, data) + version = (data['id'], data['modelId']) + + if version: + db.execute(""" + UPDATE local_files + SET civitai_version_id = ?, civitai_model_id = ? + WHERE id = ? + """, (version[0], version[1], file_id)) +``` + +### 3. Cache Command (`tsr cache`) + +Fetch and store full model details from CivitAI: + +```python +def cache_model(model_id: int, db: Connection, api_key: str | None) -> None: + """Fetch and cache complete model data from CivitAI.""" + data = fetch_civitai_model(model_id, api_key) + if not data: + return + + # Upsert creator + creator = data.get('creator', {}) + if creator: + db.execute(""" + INSERT INTO creators (username, image_url) VALUES (?, ?) + ON CONFLICT(username) DO UPDATE SET image_url = excluded.image_url + """, (creator['username'], creator.get('image'))) + + # Upsert model + db.execute(""" + INSERT INTO models (civitai_id, name, description, type, nsfw, ...) + VALUES (?, ?, ?, ?, ?, ...) + ON CONFLICT(civitai_id) DO UPDATE SET ... + """, ...) + + # Process versions, files, hashes, images, trained words + for idx, version in enumerate(data.get('modelVersions', [])): + store_version(db, model_id, version, version_index=idx) +``` + +### 4. Query Commands + +**List local models with CivitAI info:** +```python +def list_local_models(db: Connection) -> list[dict]: + """List all local files with their linked CivitAI metadata.""" + return db.execute(""" + SELECT * FROM v_local_files_full ORDER BY model_name + """).fetchall() +``` + +**Search cached models:** +```python +def search_cached(query: str, model_type: str | None, db: Connection) -> list[dict]: + """Search cached models without hitting the API.""" + sql = """ + SELECT m.*, mv.base_model, mv.download_url + FROM models m + JOIN model_versions mv ON mv.model_id = m.id AND mv.version_index = 0 + WHERE m.name LIKE ? + """ + params = [f'%{query}%'] + + if model_type: + sql += " AND m.type = ?" + params.append(model_type) + + return db.execute(sql, params).fetchall() +``` + +**Find trigger words for a local LoRA:** +```python +def get_trigger_words(file_path: str, db: Connection) -> list[str]: + """Get trigger words for a local LoRA file.""" + return db.execute(""" + SELECT tw.word + FROM trained_words tw + JOIN model_versions mv ON tw.version_id = mv.id + JOIN local_files lf ON lf.civitai_version_id = mv.civitai_id + WHERE lf.file_path = ? + ORDER BY tw.position + """, (file_path,)).fetchall() +``` + +## Database Location + +Following XDG conventions, the database should live at: + +```python +from tensors.config import DATA_DIR + +DB_PATH = DATA_DIR / "models.db" # ~/.local/share/tensors/models.db +``` + +## CLI Integration + +```bash +# Scan models directory +tsr db scan /models/ + +# Link local files to CivitAI (uses API for unknown hashes) +tsr db link + +# Cache a specific model's full data +tsr db cache 999258 + +# List local models with CivitAI info +tsr db list + +# Search cached models (offline) +tsr db search "bimbo" --type lora + +# Show trigger words for a LoRA +tsr db triggers /models/loras/70s_VPMS.safetensors + +# Show generation params from example images +tsr db prompts 999258 +``` + +## Benefits + +1. **Offline First** — Query cached data without API calls +2. **Hash Deduplication** — Detect duplicate files by SHA256 +3. **Metadata Enrichment** — Combine safetensor header info with CivitAI metadata +4. **Trigger Word Lookup** — Find correct prompts for LoRAs +5. **Example Prompts** — Extract working prompts from preview images +6. **Version Tracking** — Know which version you have vs. latest available diff --git a/models.db b/models.db new file mode 100644 index 0000000..5db1b4e Binary files /dev/null and b/models.db differ