mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-05 23:03:55 +00:00
f79782a008
* Resolves issue #13 - Switched model to bge-small-en-v1.5 - Added lazy re-embedding - Added model version tracking per vector - Added conversion of vectors to the new model * Add lfs support to the workflow. * Implements importance scoring with decay + voting #6 * Resolves issue #5 by marking observations as superseeded and scheduled for deletion * Implement pattern detection #7 * Improve injections and observations accuracy - Session start: Recent observations for project context (recency-based) - User prompt: Semantically relevant observations (similarity-based with threshold) * Added two stage retrieval with bi and cross encoder #8 * Implement query expansion and reformulation #9 * Knowledge graph and relationships ( resolves #4 ) - File Overlap Detection: Detects relationships when observations modify/read the same files - Concept Overlap Detection: Detects relationships based on shared semantic concepts - Type Progression Detection: Infers relationships from natural observation type progressions (e.g., discovery → bugfix = "fixes") - Temporal Proximity Detection: Detects relationships between observations in the same session within 5 minutes - Narrative Mention Detection: Detects explicit relationship language in narratives (e.g., "fixes", "depends on", "supersedes") * Add visualisation of the relations to the dashboard. * fixup! Add visualisation of the relations to the dashboard. * Update documentation with new settings and screenshots.
122 lines
3.3 KiB
Bash
Executable File
122 lines
3.3 KiB
Bash
Executable File
#!/bin/bash
|
|
# Download BGE-small-en-v1.5 model for embedding
|
|
# Usage: ./download-bge-model.sh [--force]
|
|
# Use --force to re-download even if files exist
|
|
|
|
set -e
|
|
|
|
MODEL_NAME="bge-small-en-v1.5"
|
|
MODEL_REPO="BAAI/bge-small-en-v1.5"
|
|
ASSETS_DIR="internal/embedding/assets"
|
|
VERSION_FILE="${ASSETS_DIR}/.model_version"
|
|
FORCE_DOWNLOAD=false
|
|
|
|
# Check for --force flag
|
|
for arg in "$@"; do
|
|
if [ "$arg" = "--force" ]; then
|
|
FORCE_DOWNLOAD=true
|
|
fi
|
|
done
|
|
|
|
# Temporary directory for downloads
|
|
TEMP_DIR=$(mktemp -d)
|
|
trap "rm -rf ${TEMP_DIR}" EXIT
|
|
|
|
# Check if model already exists
|
|
model_exists() {
|
|
[ -f "${ASSETS_DIR}/model.onnx" ] && [ -f "${ASSETS_DIR}/tokenizer.json" ]
|
|
}
|
|
|
|
# Get installed version
|
|
get_installed_version() {
|
|
if [ -f "$VERSION_FILE" ]; then
|
|
cat "$VERSION_FILE"
|
|
else
|
|
echo ""
|
|
fi
|
|
}
|
|
|
|
# Write version file
|
|
write_version_file() {
|
|
echo "${MODEL_NAME}" > "$VERSION_FILE"
|
|
}
|
|
|
|
download_model() {
|
|
echo "Downloading ${MODEL_NAME} from Hugging Face..."
|
|
|
|
# Create assets directory
|
|
mkdir -p "${ASSETS_DIR}"
|
|
|
|
# Download ONNX model
|
|
# BGE models have ONNX exports available in the repo
|
|
echo "Downloading ONNX model..."
|
|
curl -fsSL \
|
|
"https://huggingface.co/${MODEL_REPO}/resolve/main/onnx/model.onnx" \
|
|
-o "${TEMP_DIR}/model.onnx"
|
|
|
|
# Download tokenizer.json
|
|
echo "Downloading tokenizer..."
|
|
curl -fsSL \
|
|
"https://huggingface.co/${MODEL_REPO}/resolve/main/tokenizer.json" \
|
|
-o "${TEMP_DIR}/tokenizer.json"
|
|
|
|
# Verify files exist and have content
|
|
if [ ! -s "${TEMP_DIR}/model.onnx" ]; then
|
|
echo "Error: Failed to download model.onnx or file is empty"
|
|
exit 1
|
|
fi
|
|
|
|
if [ ! -s "${TEMP_DIR}/tokenizer.json" ]; then
|
|
echo "Error: Failed to download tokenizer.json or file is empty"
|
|
exit 1
|
|
fi
|
|
|
|
# Move to assets directory (backup old files first)
|
|
if [ -f "${ASSETS_DIR}/model.onnx" ]; then
|
|
mv "${ASSETS_DIR}/model.onnx" "${ASSETS_DIR}/model.onnx.bak"
|
|
fi
|
|
if [ -f "${ASSETS_DIR}/tokenizer.json" ]; then
|
|
mv "${ASSETS_DIR}/tokenizer.json" "${ASSETS_DIR}/tokenizer.json.bak"
|
|
fi
|
|
|
|
mv "${TEMP_DIR}/model.onnx" "${ASSETS_DIR}/model.onnx"
|
|
mv "${TEMP_DIR}/tokenizer.json" "${ASSETS_DIR}/tokenizer.json"
|
|
|
|
# Remove backups on success
|
|
rm -f "${ASSETS_DIR}/model.onnx.bak" "${ASSETS_DIR}/tokenizer.json.bak"
|
|
|
|
# Write version file
|
|
write_version_file
|
|
|
|
echo "Model size: $(du -h "${ASSETS_DIR}/model.onnx" | cut -f1)"
|
|
echo "Tokenizer size: $(du -h "${ASSETS_DIR}/tokenizer.json" | cut -f1)"
|
|
}
|
|
|
|
echo "BGE Model Downloader - ${MODEL_NAME}"
|
|
echo "=================================="
|
|
|
|
need_download=false
|
|
reason=""
|
|
|
|
if [ "$FORCE_DOWNLOAD" = true ]; then
|
|
need_download=true
|
|
reason="forced"
|
|
elif ! model_exists; then
|
|
need_download=true
|
|
reason="not found"
|
|
elif [ "$(get_installed_version)" != "${MODEL_NAME}" ]; then
|
|
need_download=true
|
|
reason="version mismatch (installed: $(get_installed_version), required: ${MODEL_NAME})"
|
|
fi
|
|
|
|
if [ "$need_download" = true ]; then
|
|
if [ -n "$reason" ] && [ "$reason" != "not found" ]; then
|
|
echo "Re-downloading: ${reason}"
|
|
fi
|
|
download_model
|
|
echo "Done! ${MODEL_NAME} installed successfully."
|
|
else
|
|
echo "Model ${MODEL_NAME} already exists, skipping download."
|
|
echo "Use --force to re-download."
|
|
fi
|