# model_tools/metadata_audit.py
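"""Audit local model folders for merge compatibility.

Crawls the configured cache directories for config.json files and flags any
model whose vocab size, RoPE theta, or EOS setup deviates from the Mistral
Nemo (Tekken) baseline defined below.
"""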
import os
import json
import sys
# ==============================================================================
# CONFIGURATION
# ==============================================================================
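# Folders to crawl recursively for model directories (anything containing a
# config.json). Adjust these to match your own cache / download locations.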
SEARCH_ROOTS = [
r"B:\LLM\.cache\huggingface\hub",
r"C:\.cache"
]
# Mistral Nemo Baseline (Tekken Tokenizer)
BASELINE_VOCAB = 131072
BASELINE_ROPE = 1000000.0
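# For reference (these values are cited in the checks below): Llama 3 uses a
# 128256-token vocab, older Mistral releases use 32000, and the "standard"
# RoPE theta is 10000. Change the two baseline constants above if you are
# auditing against a different base model.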
# ==============================================================================
# SCRIPT
# ==============================================================================
def find_model_paths(roots):
    model_paths = []
    print("🔍 Scanning directories for config.json...")
    for root_dir in roots:
        if not os.path.exists(root_dir):
            print(f"⚠️ Warning: Directory not found: {root_dir}")
            continue
        print(f" -> Crawling {root_dir} (this may take a moment)...")
        count = 0
        for root, dirs, files in os.walk(root_dir):
            if "config.json" in files:
                model_paths.append(root)
                count += 1
                # Optional: Optimization to stop diving deeper if we found a model root
                # (Commented out to ensure we find snapshots in HF cache structure)
                # dirs[:] = []
        print(f" Found {count} models in {root_dir}")
    return model_paths

def check_models():
    paths = find_model_paths(SEARCH_ROOTS)
    if not paths:
        print("\n❌ No models found in the specified directories.")
        return
    print("\n" + "="*110)
    print(f"{'Model Name (Short)':<45} | {'Vocab':<8} | {'RoPE Theta':<12} | {'EOS ID':<8} | {'Status'}")
    print("="*110)
    suspects = []
    for path in paths:
        config_path = os.path.join(path, "config.json")
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                cfg = json.load(f)
        except Exception as e:
            print(f"❌ Error reading {path}: {e}")
            continue
        # Extract Metadata
        vocab_size = cfg.get("vocab_size", 0)
        rope_theta = cfg.get("rope_theta", 0.0)
        eos_id = cfg.get("eos_token_id", "N/A")
        arch = cfg.get("architectures", ["Unknown"])[0]  # currently unused

        # Clean up the name for display (handle HF cache paths)
        name = os.path.basename(path)
        if "snapshots" in path:
            # Try to get the folder name above 'snapshots' for better readability
            try:
                parent = os.path.dirname(os.path.dirname(path))
                name = os.path.basename(parent).replace("models--", "")
            except Exception:
                pass
        # --- THE AUDIT LOGIC ---
        flags = []
        is_suspect = False

        # Check Vocab (the most likely cause of the EOS bug)
        # Mistral Nemo is 131072. Llama 3 is 128256. Old Mistral is 32000.
        if vocab_size != BASELINE_VOCAB:
            flags.append(f"VOCAB({vocab_size})")
            is_suspect = True

        # Check RoPE (Nemo is 1,000,000. Standard is 10,000)
        if float(rope_theta) != float(BASELINE_ROPE):
            flags.append(f"ROPE({int(rope_theta)})")
            is_suspect = True

        # Check EOS (Multi-EOS can confuse mergekit)
        if isinstance(eos_id, list) and len(eos_id) > 1:
            flags.append("MULTI-EOS")
            # This isn't always fatal, but good to know

        status = "✅ OK" if not is_suspect else f"🚩 {' '.join(flags)}"

        # Print Row
        print(f"{name[:45]:<45} | {str(vocab_size):<8} | {str(rope_theta):<12} | {str(eos_id):<8} | {status}")
        if is_suspect:
            suspects.append((name, path, flags))
print("\n" + "="*110)
if suspects:
print(f"🚨 DETECTED {len(suspects)} POTENTIALLY INCOMPATIBLE MODELS:")
print("These models do not match the Mistral Nemo baseline (Vocab 131k, RoPE 1M).")
print("Including them in the merge is likely causing the 'One Sentence' bug. Use vocab_resizer.py to fix.\n")
for s_name, s_path, s_flags in suspects:
print(f"❌ {s_name}")
print(f" Path: {s_path}")
print(f" Issues: {', '.join(s_flags)}\n")
else:
print("βœ… All scanned models match the Mistral Nemo baseline specs.")
if __name__ == "__main__":
check_models()
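
# ------------------------------------------------------------------------------
# Usage sketch (illustrative, not executed): run the audit directly with
#     python metadata_audit.py
# or, assuming this file is importable as `metadata_audit`, reuse the scanner
# from another tool. The folder below is a hypothetical example path.
#
#     from metadata_audit import find_model_paths
#     paths = find_model_paths([r"D:\models"])
#     print(f"{len(paths)} candidate model folders found")
# ------------------------------------------------------------------------------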