mirror of
https://github.com/lukaszraczylo/claude-mnemonic.git
synced 2026-06-11 00:09:28 +00:00
Resolves issue #13
- Switched model to bge-small-en-v1.5
- Added lazy re-embedding
- Added model version tracking per vector
- Added conversion of vectors to the new model
This commit is contained in:
Executable
+121
@@ -0,0 +1,121 @@
|
||||
#!/bin/bash
# Download the BGE-small-en-v1.5 model files used for embedding.
#
# Usage: ./download-bge-model.sh [--force]
#   --force   re-download even if the files already exist
#
# ASSETS_DIR is a relative path, so it resolves against the directory the
# script is invoked from.

set -e

MODEL_NAME="bge-small-en-v1.5"
MODEL_REPO="BAAI/bge-small-en-v1.5"
ASSETS_DIR="internal/embedding/assets"
VERSION_FILE="${ASSETS_DIR}/.model_version"
FORCE_DOWNLOAD=false

# Scan every argument for --force; any other argument is ignored.
for arg in "$@"; do
  if [ "$arg" = "--force" ]; then
    FORCE_DOWNLOAD=true
  fi
done

# Scratch directory for downloads, removed on every exit path.
TEMP_DIR=$(mktemp -d)
# Single quotes defer the expansion until the trap fires, and the inner
# double quotes keep a path containing whitespace or glob characters as a
# single argument to rm.
trap 'rm -rf -- "${TEMP_DIR}"' EXIT
|
||||
|
||||
# Succeed (exit status 0) only when both model.onnx and tokenizer.json are
# present as regular files in ASSETS_DIR.
# Globals: ASSETS_DIR (read)
model_exists() {
  [[ -f "${ASSETS_DIR}/model.onnx" && -f "${ASSETS_DIR}/tokenizer.json" ]]
}
|
||||
|
||||
# Print the contents of VERSION_FILE, or an empty line when that file does
# not exist.
# Globals: VERSION_FILE (read)
# Outputs: the recorded version string on stdout
get_installed_version() {
  if [ -f "$VERSION_FILE" ]; then
    cat "$VERSION_FILE"
  else
    echo ""
  fi
}
|
||||
|
||||
# Record MODEL_NAME as the single line of VERSION_FILE, replacing any
# previous content.
# Globals: MODEL_NAME (read), VERSION_FILE (read; the file is written)
write_version_file() {
  # printf rather than echo so the value is always written verbatim.
  printf '%s\n' "${MODEL_NAME}" > "$VERSION_FILE"
}
|
||||
|
||||
# Download model.onnx and tokenizer.json for MODEL_REPO from Hugging Face
# into TEMP_DIR, then install them into ASSETS_DIR and record the version.
#
# Globals:   MODEL_NAME, MODEL_REPO, ASSETS_DIR, TEMP_DIR (read)
# Calls:     write_version_file
# Outputs:   progress and file sizes on stdout, errors on stderr
# Exits:     with status 1 if a download fails, a downloaded file is
#            empty, or the files cannot be installed; on an install
#            failure any files that were backed up are restored first
download_model() {
  local file
  local installed=true

  echo "Downloading ${MODEL_NAME} from Hugging Face..."

  # Create assets directory
  mkdir -p "${ASSETS_DIR}"

  # The ONNX export is fetched from the onnx/ folder of the repo.
  # curl is checked explicitly so a failure is reported even when the
  # caller is not running under `set -e`.
  echo "Downloading ONNX model..."
  if ! curl -fsSL \
    "https://huggingface.co/${MODEL_REPO}/resolve/main/onnx/model.onnx" \
    -o "${TEMP_DIR}/model.onnx"; then
    echo "Error: Failed to download model.onnx or file is empty" >&2
    exit 1
  fi

  echo "Downloading tokenizer..."
  if ! curl -fsSL \
    "https://huggingface.co/${MODEL_REPO}/resolve/main/tokenizer.json" \
    -o "${TEMP_DIR}/tokenizer.json"; then
    echo "Error: Failed to download tokenizer.json or file is empty" >&2
    exit 1
  fi

  # Verify files exist and have content
  for file in model.onnx tokenizer.json; do
    if [ ! -s "${TEMP_DIR}/${file}" ]; then
      echo "Error: Failed to download ${file} or file is empty" >&2
      exit 1
    fi
  done

  # Set the current files aside so a failed install can be rolled back.
  for file in model.onnx tokenizer.json; do
    if [ -f "${ASSETS_DIR}/${file}" ]; then
      if ! mv "${ASSETS_DIR}/${file}" "${ASSETS_DIR}/${file}.bak"; then
        installed=false
        break
      fi
    fi
  done

  if [ "$installed" = true ]; then
    for file in model.onnx tokenizer.json; do
      if ! mv "${TEMP_DIR}/${file}" "${ASSETS_DIR}/${file}"; then
        installed=false
        break
      fi
    done
  fi

  if [ "$installed" != true ]; then
    # Put back whatever was backed up; files without a backup are left
    # alone so nothing that was never moved gets touched.
    for file in model.onnx tokenizer.json; do
      if [ -f "${ASSETS_DIR}/${file}.bak" ]; then
        mv -f "${ASSETS_DIR}/${file}.bak" "${ASSETS_DIR}/${file}" \
          || echo "Warning: could not restore ${ASSETS_DIR}/${file}" >&2
      fi
    done
    echo "Error: Failed to install model files into ${ASSETS_DIR}" >&2
    exit 1
  fi

  # Remove backups on success
  rm -f "${ASSETS_DIR}/model.onnx.bak" "${ASSETS_DIR}/tokenizer.json.bak"

  # Write version file
  write_version_file

  echo "Model size: $(du -h "${ASSETS_DIR}/model.onnx" | cut -f1)"
  echo "Tokenizer size: $(du -h "${ASSETS_DIR}/tokenizer.json" | cut -f1)"
}
|
||||
|
||||
echo "BGE Model Downloader - ${MODEL_NAME}"
echo "=================================="

# Decide whether a download is needed and why. The checks run in priority
# order: an explicit --force, missing model files, then a recorded version
# that differs from MODEL_NAME.
need_download=false
reason=""

if [ "$FORCE_DOWNLOAD" = true ]; then
  need_download=true
  reason="forced"
elif ! model_exists; then
  need_download=true
  reason="not found"
elif [ "$(get_installed_version)" != "${MODEL_NAME}" ]; then
  need_download=true
  reason="version mismatch (installed: $(get_installed_version), required: ${MODEL_NAME})"
fi

if [ "$need_download" = true ]; then
  # A first-time download ("not found") is not announced as a re-download.
  if [ -n "$reason" ] && [ "$reason" != "not found" ]; then
    echo "Re-downloading: ${reason}"
  fi
  download_model
  echo "Done! ${MODEL_NAME} installed successfully."
else
  echo "Model ${MODEL_NAME} already exists, skipping download."
  echo "Use --force to re-download."
fi
|
||||
Reference in New Issue
Block a user