Files
gohoarder/migrations/001_create_schema_v2_postgres.sql
T
lukaszraczylo c0061b99e3 chore(schema): migrate to GORM V2 with multi-database support
- [x] Implement GORM V2 metadata store with SQLite, PostgreSQL, and MySQL support
- [x] Add database migration system using gormigrate for schema versioning
- [x] Create migration CLI tool with support for migrate, rollback, and status commands
- [x] Add Docker support for migration container (Dockerfile.migrate)
- [x] Implement automatic partition management for PostgreSQL time-series tables
- [x] Add background aggregation worker for download statistics
- [x] Support connection pooling configuration (max_open_conns, max_idle_conns, conn_max_lifetime)
- [x] Add blocking mechanism based on vulnerability thresholds in stats and handlers
- [x] Update Helm charts with migration init containers and multi-database configuration
- [x] Replace deprecated SQLite store with optimized GORM implementation
- [x] Add comprehensive integration tests for MySQL and PostgreSQL
- [x] Update frontend to display blocked packages and storage utilization
- [x] Add goreleaser configuration for migrate binary and container image
- [x] Update configuration examples with database backend options and recommendations
2026-01-03 20:44:23 +00:00

471 lines
20 KiB
PL/PgSQL

-- GoHoarder Database Schema V2 - PostgreSQL
-- Optimized for multi-user production deployments
-- Created: 2026-01-03
-- Enable required extensions
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
-- ============================================================================
-- TABLE: registries
-- Purpose: Normalized registry data (eliminates repeated strings)
-- ============================================================================
CREATE TABLE IF NOT EXISTS registries (
id SERIAL PRIMARY KEY,
name VARCHAR(50) UNIQUE NOT NULL,
display_name VARCHAR(100) NOT NULL,
upstream_url VARCHAR(512) NOT NULL,
enabled BOOLEAN NOT NULL DEFAULT TRUE,
scan_by_default BOOLEAN NOT NULL DEFAULT TRUE,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW(),
deleted_at TIMESTAMP
);
CREATE INDEX idx_registry_name ON registries(name) WHERE deleted_at IS NULL;
CREATE INDEX idx_registry_enabled ON registries(enabled) WHERE enabled = TRUE AND deleted_at IS NULL;
COMMENT ON TABLE registries IS 'Normalized registry data (npm, pypi, go)';
COMMENT ON COLUMN registries.name IS 'Short name: npm, pypi, go';
COMMENT ON COLUMN registries.display_name IS 'Human-readable name: NPM Registry, PyPI';
-- ============================================================================
-- TABLE: packages
-- Purpose: Core package metadata with denormalized counts for performance
-- ============================================================================
CREATE TABLE IF NOT EXISTS packages (
id BIGSERIAL PRIMARY KEY,
registry_id INTEGER NOT NULL REFERENCES registries(id) ON DELETE RESTRICT,
name VARCHAR(255) NOT NULL,
version VARCHAR(100) NOT NULL,
-- Storage information
storage_key VARCHAR(512) UNIQUE NOT NULL,
size BIGINT NOT NULL,
checksum_md5 VARCHAR(32),
checksum_sha256 VARCHAR(64),
upstream_url VARCHAR(1024),
-- Cache management
cached_at TIMESTAMP NOT NULL DEFAULT NOW(),
last_accessed TIMESTAMP NOT NULL DEFAULT NOW(),
expires_at TIMESTAMP,
access_count BIGINT NOT NULL DEFAULT 0,
-- Security (denormalized for performance)
security_scanned BOOLEAN NOT NULL DEFAULT FALSE,
last_scanned_at TIMESTAMP,
vulnerability_count INTEGER NOT NULL DEFAULT 0,
highest_severity VARCHAR(20), -- critical, high, medium, low, none
-- Authentication
requires_auth BOOLEAN NOT NULL DEFAULT FALSE,
auth_provider VARCHAR(50),
-- Audit trail
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW(),
deleted_at TIMESTAMP
);
-- Composite indexes for common queries
CREATE UNIQUE INDEX idx_package_registry_name_version
ON packages(registry_id, name, version) WHERE deleted_at IS NULL;
CREATE INDEX idx_package_storage_key ON packages(storage_key);
CREATE INDEX idx_package_name ON packages(name text_pattern_ops) WHERE deleted_at IS NULL;
CREATE INDEX idx_package_last_accessed ON packages(last_accessed DESC) WHERE deleted_at IS NULL;
CREATE INDEX idx_package_expires_at ON packages(expires_at) WHERE expires_at IS NOT NULL AND deleted_at IS NULL;
CREATE INDEX idx_package_access_count ON packages(access_count DESC) WHERE deleted_at IS NULL;
CREATE INDEX idx_package_size ON packages(size DESC);
-- Partial indexes for security queries
CREATE INDEX idx_package_vuln_count ON packages(vulnerability_count) WHERE vulnerability_count > 0 AND deleted_at IS NULL;
CREATE INDEX idx_package_severity ON packages(highest_severity) WHERE highest_severity IN ('critical', 'high') AND deleted_at IS NULL;
CREATE INDEX idx_package_security_scanned ON packages(security_scanned) WHERE deleted_at IS NULL;
COMMENT ON TABLE packages IS 'Core package metadata (optimized V2 schema)';
COMMENT ON COLUMN packages.access_count IS 'Total downloads (denormalized from stats)';
COMMENT ON COLUMN packages.vulnerability_count IS 'Number of vulnerabilities (denormalized)';
-- ============================================================================
-- TABLE: package_metadata
-- Purpose: Structured metadata (1:1 with packages, reduces main table size)
-- ============================================================================
CREATE TABLE IF NOT EXISTS package_metadata (
package_id BIGINT PRIMARY KEY REFERENCES packages(id) ON DELETE CASCADE,
author VARCHAR(255),
license VARCHAR(100),
homepage VARCHAR(512),
repository VARCHAR(512),
description TEXT,
keywords JSONB, -- Array of keywords
raw_metadata JSONB, -- Full metadata
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW(),
deleted_at TIMESTAMP
);
CREATE INDEX idx_metadata_author ON package_metadata(author);
CREATE INDEX idx_metadata_license ON package_metadata(license);
CREATE INDEX idx_metadata_keywords ON package_metadata USING GIN(keywords);
CREATE INDEX idx_metadata_raw ON package_metadata USING GIN(raw_metadata);
COMMENT ON TABLE package_metadata IS 'Structured package metadata (separated for performance)';
-- ============================================================================
-- TABLE: vulnerabilities
-- Purpose: Normalized vulnerability data (each CVE stored once)
-- ============================================================================
CREATE TABLE IF NOT EXISTS vulnerabilities (
id BIGSERIAL PRIMARY KEY,
cve_id VARCHAR(50) UNIQUE NOT NULL,
title VARCHAR(512) NOT NULL,
description TEXT,
severity VARCHAR(20) NOT NULL, -- critical, high, medium, low
cvss REAL,
published_at TIMESTAMP NOT NULL,
fixed_version VARCHAR(100),
references JSONB, -- Array of URLs
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW(),
deleted_at TIMESTAMP
);
CREATE UNIQUE INDEX idx_vuln_cve_id ON vulnerabilities(cve_id);
CREATE INDEX idx_vuln_severity ON vulnerabilities(severity);
CREATE INDEX idx_vuln_cvss ON vulnerabilities(cvss DESC NULLS LAST);
CREATE INDEX idx_vuln_published ON vulnerabilities(published_at DESC);
COMMENT ON TABLE vulnerabilities IS 'Normalized vulnerability data (99% storage reduction)';
-- ============================================================================
-- TABLE: package_vulnerabilities
-- Purpose: Many-to-many relationship between packages and vulnerabilities
-- ============================================================================
CREATE TABLE IF NOT EXISTS package_vulnerabilities (
id BIGSERIAL PRIMARY KEY,
package_id BIGINT NOT NULL REFERENCES packages(id) ON DELETE CASCADE,
vulnerability_id BIGINT NOT NULL REFERENCES vulnerabilities(id) ON DELETE CASCADE,
scanner VARCHAR(50) NOT NULL,
detected_at TIMESTAMP NOT NULL DEFAULT NOW(),
bypassed BOOLEAN NOT NULL DEFAULT FALSE,
bypass_id BIGINT, -- References cve_bypasses.id (soft reference)
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW(),
deleted_at TIMESTAMP
);
CREATE INDEX idx_pkg_vuln_package ON package_vulnerabilities(package_id) WHERE deleted_at IS NULL;
CREATE INDEX idx_pkg_vuln_vuln ON package_vulnerabilities(vulnerability_id) WHERE deleted_at IS NULL;
CREATE INDEX idx_pkg_vuln_composite ON package_vulnerabilities(package_id, vulnerability_id) WHERE deleted_at IS NULL;
CREATE INDEX idx_pkg_vuln_scanner ON package_vulnerabilities(scanner);
CREATE INDEX idx_pkg_vuln_bypassed ON package_vulnerabilities(bypassed) WHERE bypassed = FALSE;
-- ============================================================================
-- TABLE: scan_results
-- Purpose: Security scan results with severity breakdown
-- ============================================================================
CREATE TABLE IF NOT EXISTS scan_results (
id BIGSERIAL PRIMARY KEY,
package_id BIGINT NOT NULL REFERENCES packages(id) ON DELETE CASCADE,
scanner VARCHAR(50) NOT NULL,
scanned_at TIMESTAMP NOT NULL DEFAULT NOW(),
status VARCHAR(20) NOT NULL, -- success, failed, pending
vuln_count INTEGER NOT NULL DEFAULT 0,
critical_count INTEGER NOT NULL DEFAULT 0,
high_count INTEGER NOT NULL DEFAULT 0,
medium_count INTEGER NOT NULL DEFAULT 0,
low_count INTEGER NOT NULL DEFAULT 0,
scan_duration INTEGER NOT NULL DEFAULT 0, -- milliseconds
details JSONB,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW(),
deleted_at TIMESTAMP
);
CREATE INDEX idx_scan_package_scanner ON scan_results(package_id, scanner) WHERE deleted_at IS NULL;
CREATE INDEX idx_scan_scanned_at ON scan_results(scanned_at DESC);
CREATE INDEX idx_scan_status ON scan_results(status);
CREATE INDEX idx_scan_vuln_count ON scan_results(vuln_count) WHERE vuln_count > 0;
COMMENT ON TABLE scan_results IS 'Security scan results (optimized V2)';
-- ============================================================================
-- TABLE: cve_bypasses
-- Purpose: CVE bypass rules with usage tracking
-- ============================================================================
CREATE TABLE IF NOT EXISTS cve_bypasses (
id BIGSERIAL PRIMARY KEY,
type VARCHAR(20) NOT NULL, -- cve, package, registry
target VARCHAR(512) NOT NULL,
reason TEXT NOT NULL,
created_by VARCHAR(255) NOT NULL,
expires_at TIMESTAMP NOT NULL,
notify_on_expiry BOOLEAN NOT NULL DEFAULT FALSE,
active BOOLEAN NOT NULL DEFAULT TRUE,
usage_count BIGINT NOT NULL DEFAULT 0,
last_used_at TIMESTAMP,
registry_id INTEGER REFERENCES registries(id),
package_id BIGINT REFERENCES packages(id),
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW(),
deleted_at TIMESTAMP
);
CREATE INDEX idx_bypass_type ON cve_bypasses(type);
CREATE INDEX idx_bypass_target ON cve_bypasses(target);
CREATE INDEX idx_bypass_active ON cve_bypasses(active) WHERE active = TRUE AND deleted_at IS NULL;
CREATE INDEX idx_bypass_expires_at ON cve_bypasses(expires_at) WHERE active = TRUE;
CREATE INDEX idx_bypass_created_by ON cve_bypasses(created_by);
COMMENT ON TABLE cve_bypasses IS 'CVE bypass rules with scope limiting';
-- ============================================================================
-- PARTITIONED TABLE: download_events
-- Purpose: High-volume time-series data (partitioned by month)
-- ============================================================================
CREATE TABLE IF NOT EXISTS download_events (
id BIGSERIAL,
package_id BIGINT NOT NULL,
registry_id INTEGER NOT NULL,
downloaded_at TIMESTAMP NOT NULL,
user_agent VARCHAR(512),
ip_address VARCHAR(45),
authenticated BOOLEAN NOT NULL DEFAULT FALSE,
username VARCHAR(255)
) PARTITION BY RANGE (downloaded_at);
CREATE INDEX idx_download_events_package ON download_events(package_id, downloaded_at);
CREATE INDEX idx_download_events_registry ON download_events(registry_id);
CREATE INDEX idx_download_events_time ON download_events(downloaded_at);
COMMENT ON TABLE download_events IS 'Download events (partitioned by month for performance)';
-- Create partitions for current month ± 2 months
DO $$
DECLARE
start_date DATE;
end_date DATE;
partition_name TEXT;
i INTEGER;
BEGIN
FOR i IN -2..2 LOOP
start_date := date_trunc('month', NOW() + (i || ' months')::INTERVAL)::DATE;
end_date := (start_date + INTERVAL '1 month')::DATE;
partition_name := 'download_events_' || to_char(start_date, 'YYYY_MM');
EXECUTE format('CREATE TABLE IF NOT EXISTS %I PARTITION OF download_events FOR VALUES FROM (%L) TO (%L)',
partition_name, start_date, end_date);
EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I(package_id, downloaded_at)',
partition_name || '_package_idx', partition_name);
EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I(registry_id)',
partition_name || '_registry_idx', partition_name);
END LOOP;
END $$;
-- ============================================================================
-- TABLE: download_stats_hourly
-- Purpose: Pre-aggregated hourly statistics (1000x faster queries)
-- ============================================================================
CREATE TABLE IF NOT EXISTS download_stats_hourly (
id BIGSERIAL PRIMARY KEY,
registry_id INTEGER NOT NULL REFERENCES registries(id),
package_id BIGINT REFERENCES packages(id), -- NULL = all packages in registry
time_bucket TIMESTAMP NOT NULL,
download_count BIGINT NOT NULL DEFAULT 0,
unique_ips BIGINT NOT NULL DEFAULT 0,
auth_downloads BIGINT NOT NULL DEFAULT 0,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW()
);
CREATE UNIQUE INDEX idx_stats_hourly_composite
ON download_stats_hourly(registry_id, COALESCE(package_id, 0), time_bucket);
CREATE INDEX idx_stats_hourly_time ON download_stats_hourly(time_bucket DESC);
COMMENT ON TABLE download_stats_hourly IS 'Hourly aggregated stats (pre-computed)';
-- ============================================================================
-- TABLE: download_stats_daily
-- Purpose: Pre-aggregated daily statistics with analytics
-- ============================================================================
CREATE TABLE IF NOT EXISTS download_stats_daily (
id BIGSERIAL PRIMARY KEY,
registry_id INTEGER NOT NULL REFERENCES registries(id),
package_id BIGINT REFERENCES packages(id),
time_bucket TIMESTAMP NOT NULL,
download_count BIGINT NOT NULL DEFAULT 0,
unique_ips BIGINT NOT NULL DEFAULT 0,
auth_downloads BIGINT NOT NULL DEFAULT 0,
top_user_agents JSONB,
created_at TIMESTAMP NOT NULL DEFAULT NOW(),
updated_at TIMESTAMP NOT NULL DEFAULT NOW()
);
CREATE UNIQUE INDEX idx_stats_daily_composite
ON download_stats_daily(registry_id, COALESCE(package_id, 0), time_bucket);
CREATE INDEX idx_stats_daily_time ON download_stats_daily(time_bucket DESC);
COMMENT ON TABLE download_stats_daily IS 'Daily aggregated stats with analytics';
-- ============================================================================
-- PARTITIONED TABLE: audit_log
-- Purpose: Audit trail for compliance (partitioned by month)
-- ============================================================================
CREATE TABLE IF NOT EXISTS audit_log (
id BIGSERIAL,
entity_type VARCHAR(50) NOT NULL,
entity_id BIGINT NOT NULL,
action VARCHAR(20) NOT NULL, -- create, update, delete
username VARCHAR(255) NOT NULL,
timestamp TIMESTAMP NOT NULL DEFAULT NOW(),
changes JSONB,
ip_address VARCHAR(45),
user_agent VARCHAR(512)
) PARTITION BY RANGE (timestamp);
CREATE INDEX idx_audit_log_entity ON audit_log(entity_type, entity_id);
CREATE INDEX idx_audit_log_username ON audit_log(username);
CREATE INDEX idx_audit_log_timestamp ON audit_log(timestamp DESC);
COMMENT ON TABLE audit_log IS 'Audit trail for compliance and debugging';
-- Create audit_log partitions
DO $$
DECLARE
start_date DATE;
end_date DATE;
partition_name TEXT;
i INTEGER;
BEGIN
FOR i IN -1..2 LOOP
start_date := date_trunc('month', NOW() + (i || ' months')::INTERVAL)::DATE;
end_date := (start_date + INTERVAL '1 month')::DATE;
partition_name := 'audit_log_' || to_char(start_date, 'YYYY_MM');
EXECUTE format('CREATE TABLE IF NOT EXISTS %I PARTITION OF audit_log FOR VALUES FROM (%L) TO (%L)',
partition_name, start_date, end_date);
EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I(entity_type, entity_id)',
partition_name || '_entity_idx', partition_name);
EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I(username)',
partition_name || '_user_idx', partition_name);
END LOOP;
END $$;
-- ============================================================================
-- FUNCTIONS: Automatic partition creation
-- ============================================================================
CREATE OR REPLACE FUNCTION create_next_month_partitions()
RETURNS void AS $$
DECLARE
next_month DATE := date_trunc('month', NOW() + INTERVAL '2 months');
partition_name TEXT;
start_date TEXT;
end_date TEXT;
BEGIN
-- Download events partition
partition_name := 'download_events_' || to_char(next_month, 'YYYY_MM');
start_date := to_char(next_month, 'YYYY-MM-DD');
end_date := to_char(next_month + INTERVAL '1 month', 'YYYY-MM-DD');
EXECUTE format('CREATE TABLE IF NOT EXISTS %I PARTITION OF download_events FOR VALUES FROM (%L) TO (%L)',
partition_name, start_date, end_date);
EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I(package_id, downloaded_at)',
partition_name || '_package_idx', partition_name);
EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I(registry_id)',
partition_name || '_registry_idx', partition_name);
-- Audit log partition
partition_name := 'audit_log_' || to_char(next_month, 'YYYY_MM');
EXECUTE format('CREATE TABLE IF NOT EXISTS %I PARTITION OF audit_log FOR VALUES FROM (%L) TO (%L)',
partition_name, start_date, end_date);
EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I(entity_type, entity_id)',
partition_name || '_entity_idx', partition_name);
RAISE NOTICE 'Created partitions for %', to_char(next_month, 'YYYY-MM');
END;
$$ LANGUAGE plpgsql;
COMMENT ON FUNCTION create_next_month_partitions() IS 'Auto-create partitions for next month';
-- ============================================================================
-- TRIGGERS: Updated_at timestamp
-- ============================================================================
CREATE OR REPLACE FUNCTION update_updated_at_column()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER update_registries_updated_at BEFORE UPDATE ON registries
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
CREATE TRIGGER update_packages_updated_at BEFORE UPDATE ON packages
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
CREATE TRIGGER update_package_metadata_updated_at BEFORE UPDATE ON package_metadata
FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
-- ============================================================================
-- SEED DATA: Default registries
-- ============================================================================
INSERT INTO registries (name, display_name, upstream_url, enabled, scan_by_default) VALUES
('npm', 'NPM Registry', 'https://registry.npmjs.org', TRUE, TRUE),
('pypi', 'PyPI', 'https://pypi.org', TRUE, TRUE),
('go', 'Go Modules', 'https://proxy.golang.org', TRUE, TRUE)
ON CONFLICT (name) DO NOTHING;
-- ============================================================================
-- VIEWS: Convenience views for common queries
-- ============================================================================
CREATE OR REPLACE VIEW v_vulnerable_packages AS
SELECT
r.name AS registry,
p.name,
p.version,
p.vulnerability_count,
p.highest_severity,
p.last_scanned_at
FROM packages p
JOIN registries r ON p.registry_id = r.id
WHERE p.vulnerability_count > 0 AND p.deleted_at IS NULL
ORDER BY
CASE p.highest_severity
WHEN 'critical' THEN 1
WHEN 'high' THEN 2
WHEN 'medium' THEN 3
WHEN 'low' THEN 4
ELSE 5
END,
p.vulnerability_count DESC;
COMMENT ON VIEW v_vulnerable_packages IS 'All packages with vulnerabilities (sorted by severity)';
-- ============================================================================
-- COMPLETE
-- ============================================================================
SELECT 'Schema V2 created successfully!' AS status;