Files
claude-mnemonic/scripts/download-bge-model.sh
T
lukaszraczylo f79782a008 Release dec 2025 (#15)
* Resolves issue #13

- Switched model to bge-small-en-v1.5
- Added lazy re-embedding
- Added model version tracking per vector
- Added conversion of vectors to the new model

* Add lfs support to the workflow.

* Implements importance scoring with decay + voting #6

* Resolves issue #5 by marking observations as superseeded and scheduled for deletion

* Implement pattern detection #7

* Improve injections and observations accuracy

- Session start: Recent observations for project context (recency-based)
- User prompt: Semantically relevant observations (similarity-based with threshold)

* Added two stage retrieval with bi and cross encoder #8

* Implement query expansion and reformulation #9

* Knowledge graph and relationships ( resolves #4 )

- File Overlap Detection: Detects relationships when observations modify/read the same files
- Concept Overlap Detection: Detects relationships based on shared semantic concepts
- Type Progression Detection: Infers relationships from natural observation type progressions (e.g., discovery → bugfix = "fixes")
- Temporal Proximity Detection: Detects relationships between observations in the same session within 5 minutes
- Narrative Mention Detection: Detects explicit relationship language in narratives (e.g., "fixes", "depends on", "supersedes")

* Add visualisation of the relations to the dashboard.

* fixup! Add visualisation of the relations to the dashboard.

* Update documentation with new settings and screenshots.
2025-12-19 17:57:11 +00:00

122 lines
3.3 KiB
Bash
Executable File

#!/bin/bash
# Download BGE-small-en-v1.5 model for embedding
# Usage: ./download-bge-model.sh [--force]
# Use --force to re-download even if files exist
set -e
MODEL_NAME="bge-small-en-v1.5"
MODEL_REPO="BAAI/bge-small-en-v1.5"
ASSETS_DIR="internal/embedding/assets"
VERSION_FILE="${ASSETS_DIR}/.model_version"
FORCE_DOWNLOAD=false
# Check for --force flag
for arg in "$@"; do
if [ "$arg" = "--force" ]; then
FORCE_DOWNLOAD=true
fi
done
# Temporary directory for downloads
TEMP_DIR=$(mktemp -d)
trap "rm -rf ${TEMP_DIR}" EXIT
# Check if model already exists
model_exists() {
[ -f "${ASSETS_DIR}/model.onnx" ] && [ -f "${ASSETS_DIR}/tokenizer.json" ]
}
# Get installed version
get_installed_version() {
if [ -f "$VERSION_FILE" ]; then
cat "$VERSION_FILE"
else
echo ""
fi
}
# Write version file
write_version_file() {
echo "${MODEL_NAME}" > "$VERSION_FILE"
}
download_model() {
echo "Downloading ${MODEL_NAME} from Hugging Face..."
# Create assets directory
mkdir -p "${ASSETS_DIR}"
# Download ONNX model
# BGE models have ONNX exports available in the repo
echo "Downloading ONNX model..."
curl -fsSL \
"https://huggingface.co/${MODEL_REPO}/resolve/main/onnx/model.onnx" \
-o "${TEMP_DIR}/model.onnx"
# Download tokenizer.json
echo "Downloading tokenizer..."
curl -fsSL \
"https://huggingface.co/${MODEL_REPO}/resolve/main/tokenizer.json" \
-o "${TEMP_DIR}/tokenizer.json"
# Verify files exist and have content
if [ ! -s "${TEMP_DIR}/model.onnx" ]; then
echo "Error: Failed to download model.onnx or file is empty"
exit 1
fi
if [ ! -s "${TEMP_DIR}/tokenizer.json" ]; then
echo "Error: Failed to download tokenizer.json or file is empty"
exit 1
fi
# Move to assets directory (backup old files first)
if [ -f "${ASSETS_DIR}/model.onnx" ]; then
mv "${ASSETS_DIR}/model.onnx" "${ASSETS_DIR}/model.onnx.bak"
fi
if [ -f "${ASSETS_DIR}/tokenizer.json" ]; then
mv "${ASSETS_DIR}/tokenizer.json" "${ASSETS_DIR}/tokenizer.json.bak"
fi
mv "${TEMP_DIR}/model.onnx" "${ASSETS_DIR}/model.onnx"
mv "${TEMP_DIR}/tokenizer.json" "${ASSETS_DIR}/tokenizer.json"
# Remove backups on success
rm -f "${ASSETS_DIR}/model.onnx.bak" "${ASSETS_DIR}/tokenizer.json.bak"
# Write version file
write_version_file
echo "Model size: $(du -h "${ASSETS_DIR}/model.onnx" | cut -f1)"
echo "Tokenizer size: $(du -h "${ASSETS_DIR}/tokenizer.json" | cut -f1)"
}
echo "BGE Model Downloader - ${MODEL_NAME}"
echo "=================================="
need_download=false
reason=""
if [ "$FORCE_DOWNLOAD" = true ]; then
need_download=true
reason="forced"
elif ! model_exists; then
need_download=true
reason="not found"
elif [ "$(get_installed_version)" != "${MODEL_NAME}" ]; then
need_download=true
reason="version mismatch (installed: $(get_installed_version), required: ${MODEL_NAME})"
fi
if [ "$need_download" = true ]; then
if [ -n "$reason" ] && [ "$reason" != "not found" ]; then
echo "Re-downloading: ${reason}"
fi
download_model
echo "Done! ${MODEL_NAME} installed successfully."
else
echo "Model ${MODEL_NAME} already exists, skipping download."
echo "Use --force to re-download."
fi