# model_tools/metadata_audit.py
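"""Audit local model folders for merge compatibility.

Crawls the configured cache directories for config.json files and flags any
model whose vocab size, RoPE theta, or EOS setup deviates from the Mistral
Nemo (Tekken) baseline defined below.
"""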
import os
import json
import sys
# ==============================================================================
# CONFIGURATION
# ==============================================================================
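# Folders to crawl recursively for model directories (anything containing a
# config.json). Adjust these to match your own cache / download locations.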
SEARCH_ROOTS = [
r"B:\LLM\.cache\huggingface\hub",
r"C:\.cache"
]
# Mistral Nemo Baseline (Tekken Tokenizer)
BASELINE_VOCAB = 131072
BASELINE_ROPE = 1000000.0
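# For reference (these values are cited in the checks below): Llama 3 uses a
# 128256-token vocab, older Mistral releases use 32000, and the "standard"
# RoPE theta is 10000. Change the two baseline constants above if you are
# auditing against a different base model.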
# ==============================================================================
# SCRIPT
# ==============================================================================
def find_model_paths(roots):
    model_paths = []
    print("🔍 Scanning directories for config.json...")
    for root_dir in roots:
        if not os.path.exists(root_dir):
            print(f"⚠️ Warning: Directory not found: {root_dir}")
            continue
        print(f" -> Crawling {root_dir} (this may take a moment)...")
        count = 0
        for root, dirs, files in os.walk(root_dir):
            if "config.json" in files:
                model_paths.append(root)
                count += 1
                # Optional: Optimization to stop diving deeper if we found a model root
                # (Commented out to ensure we find snapshots in HF cache structure)
                # dirs[:] = []
        print(f" Found {count} models in {root_dir}")
    return model_paths

def check_models():
    paths = find_model_paths(SEARCH_ROOTS)
    if not paths:
        print("\n❌ No models found in the specified directories.")
        return
    print("\n" + "="*110)
    print(f"{'Model Name (Short)':<45} | {'Vocab':<8} | {'RoPE Theta':<12} | {'EOS ID':<8} | {'Status'}")
    print("="*110)
    suspects = []
    for path in paths:
        config_path = os.path.join(path, "config.json")
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                cfg = json.load(f)
        except Exception as e:
            print(f"❌ Error reading {path}: {e}")
            continue
        # Extract Metadata
        vocab_size = cfg.get("vocab_size", 0)
        rope_theta = cfg.get("rope_theta", 0.0)
        eos_id = cfg.get("eos_token_id", "N/A")
        arch = cfg.get("architectures", ["Unknown"])[0]  # currently unused

        # Clean up the name for display (handle HF cache paths)
        name = os.path.basename(path)
        if "snapshots" in path:
            # Try to get the folder name above 'snapshots' for better readability
            try:
                parent = os.path.dirname(os.path.dirname(path))
                name = os.path.basename(parent).replace("models--", "")
            except Exception:
                pass
        # --- THE AUDIT LOGIC ---
        flags = []
        is_suspect = False

        # Check Vocab (the most likely cause of the EOS bug)
        # Mistral Nemo is 131072. Llama 3 is 128256. Old Mistral is 32000.
        if vocab_size != BASELINE_VOCAB:
            flags.append(f"VOCAB({vocab_size})")
            is_suspect = True

        # Check RoPE (Nemo is 1,000,000. Standard is 10,000)
        if float(rope_theta) != float(BASELINE_ROPE):
            flags.append(f"ROPE({int(rope_theta)})")
            is_suspect = True

        # Check EOS (Multi-EOS can confuse mergekit)
        if isinstance(eos_id, list) and len(eos_id) > 1:
            flags.append("MULTI-EOS")
            # This isn't always fatal, but good to know

        status = "✅ OK" if not is_suspect else f"🚩 {' '.join(flags)}"

        # Print Row
        print(f"{name[:45]:<45} | {str(vocab_size):<8} | {str(rope_theta):<12} | {str(eos_id):<8} | {status}")
        if is_suspect:
            suspects.append((name, path, flags))
print("\n" + "="*110)
if suspects:
print(f"🚨 DETECTED {len(suspects)} POTENTIALLY INCOMPATIBLE MODELS:")
print("These models do not match the Mistral Nemo baseline (Vocab 131k, RoPE 1M).")
print("Including them in the merge is likely causing the 'One Sentence' bug. Use vocab_resizer.py to fix.\n")
for s_name, s_path, s_flags in suspects:
print(f"❌ {s_name}")
print(f" Path: {s_path}")
print(f" Issues: {', '.join(s_flags)}\n")
else:
print("βœ… All scanned models match the Mistral Nemo baseline specs.")
if __name__ == "__main__":
check_models()
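
# ------------------------------------------------------------------------------
# Usage sketch (illustrative, not executed): run the audit directly with
#     python metadata_audit.py
# or, assuming this file is importable as `metadata_audit`, reuse the scanner
# from another tool. The folder below is a hypothetical example path.
#
#     from metadata_audit import find_model_paths
#     paths = find_model_paths([r"D:\models"])
#     print(f"{len(paths)} candidate model folders found")
# ------------------------------------------------------------------------------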