File size: 4,430 Bytes
5948696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f4bde3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import os
import json
import sys

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Roots crawled for model directories (any folder containing a config.json).
# NOTE(review): machine-specific Windows paths — adjust per environment.
SEARCH_ROOTS = [
    r"B:\LLM\.cache\huggingface\hub",
    r"C:\.cache"
]

# Mistral Nemo Baseline (Tekken Tokenizer)
BASELINE_VOCAB = 131072  # expected config.json "vocab_size" for Nemo-family models
BASELINE_ROPE = 1000000.0  # expected config.json "rope_theta" (standard models use 10,000)

# ==============================================================================
# SCRIPT
# ==============================================================================

def find_model_paths(roots):
    """Recursively locate model directories under each root in *roots*.

    A "model directory" is any directory that directly contains a
    ``config.json`` file (matches both plain model folders and the
    Hugging Face cache's ``models--*/snapshots/<hash>`` layout).

    Args:
        roots: Iterable of directory paths to crawl. Roots that do not
            exist are skipped with a printed warning.

    Returns:
        list[str]: Paths of all directories containing a config.json,
        in os.walk discovery order.
    """
    model_paths = []
    print("πŸ” Scanning directories for config.json...")

    for root_dir in roots:
        if not os.path.exists(root_dir):
            print(f"⚠️  Warning: Directory not found: {root_dir}")
            continue

        print(f"   -> Crawling {root_dir} (this may take a moment)...")
        count = 0
        for dirpath, _dirnames, filenames in os.walk(root_dir):
            if "config.json" in filenames:
                model_paths.append(dirpath)
                count += 1
                # NOTE: deliberately do NOT prune the walk here
                # (`_dirnames[:] = []`); HF cache layouts nest configs under
                # snapshots/ folders that pruning would miss.
        print(f"      Found {count} models in {root_dir}")

    return model_paths

def _short_name(path):
    """Return a readable model name for a config directory.

    Hugging Face cache paths (``.../models--org--name/snapshots/<hash>``)
    are shortened to ``org--name``; anything else uses the directory's
    basename.
    """
    if "snapshots" in path:
        # The folder two levels above the snapshot hash carries the repo name.
        repo_dir = os.path.basename(os.path.dirname(os.path.dirname(path)))
        if repo_dir:
            return repo_dir.replace("models--", "")
    return os.path.basename(path)


def _audit(vocab_size, rope_theta, eos_id):
    """Compare one model's metadata against the Mistral Nemo baseline.

    Args:
        vocab_size: ``vocab_size`` from config.json (0 if absent).
        rope_theta: ``rope_theta`` from config.json (0.0 if absent).
        eos_id: ``eos_token_id`` from config.json (may be int, list, or "N/A").

    Returns:
        tuple[list[str], bool]: (flags, is_suspect). Flags are short issue
        tags; is_suspect is True when the model is likely merge-incompatible.
    """
    flags = []
    is_suspect = False

    # Vocab mismatch is the most likely cause of broken EOS behavior after a
    # merge. Nemo is 131072; Llama 3 is 128256; old Mistral is 32000.
    if vocab_size != BASELINE_VOCAB:
        flags.append(f"VOCAB({vocab_size})")
        is_suspect = True

    # Nemo uses rope_theta = 1,000,000; a standard 10,000 model is suspect.
    try:
        if float(rope_theta) != float(BASELINE_ROPE):
            flags.append(f"ROPE({int(float(rope_theta))})")
            is_suspect = True
    except (TypeError, ValueError):
        # Null/non-numeric rope_theta in the config — flag it, don't crash.
        flags.append(f"ROPE({rope_theta})")
        is_suspect = True

    # Multiple EOS ids can confuse mergekit; not always fatal, but worth noting.
    if isinstance(eos_id, list) and len(eos_id) > 1:
        flags.append("MULTI-EOS")

    return flags, is_suspect


def check_models():
    """Scan SEARCH_ROOTS for models and audit each config.json against the
    Mistral Nemo baseline (vocab size, RoPE theta, EOS token layout).

    Prints a per-model report table, then a summary of suspect models with
    their paths and specific issues. Unreadable configs are reported and
    skipped.
    """
    paths = find_model_paths(SEARCH_ROOTS)

    if not paths:
        print("\n❌ No models found in the specified directories.")
        return

    print("\n" + "=" * 110)
    print(f"{'Model Name (Short)':<45} | {'Vocab':<8} | {'RoPE Theta':<12} | {'EOS ID':<8} | {'Status'}")
    print("=" * 110)

    suspects = []

    for path in paths:
        config_path = os.path.join(path, "config.json")

        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                cfg = json.load(f)
        except Exception as e:
            # Best-effort scan: report the unreadable config and keep going.
            print(f"❌ Error reading {path}: {e}")
            continue

        # Metadata used both for the report row and the audit.
        vocab_size = cfg.get("vocab_size", 0)
        rope_theta = cfg.get("rope_theta", 0.0)
        eos_id = cfg.get("eos_token_id", "N/A")

        name = _short_name(path)
        flags, is_suspect = _audit(vocab_size, rope_theta, eos_id)

        # Suspect models get 🚩; informational-only flags (e.g. MULTI-EOS on
        # an otherwise-matching model) get ⚠️ so they are no longer hidden.
        if is_suspect:
            status = f"🚩 {' '.join(flags)}"
        elif flags:
            status = f"⚠️ {' '.join(flags)}"
        else:
            status = "βœ… OK"

        print(f"{name[:45]:<45} | {str(vocab_size):<8} | {str(rope_theta):<12} | {str(eos_id):<8} | {status}")

        if is_suspect:
            suspects.append((name, path, flags))

    print("\n" + "=" * 110)
    if suspects:
        print(f"🚨 DETECTED {len(suspects)} POTENTIALLY INCOMPATIBLE MODELS:")
        print("These models do not match the Mistral Nemo baseline (Vocab 131k, RoPE 1M).")
        print("Including them in the merge is likely causing the 'One Sentence' bug. Use vocab_resizer.py to fix.\n")
        for s_name, s_path, s_flags in suspects:
            print(f"❌ {s_name}")
            print(f"   Path: {s_path}")
            print(f"   Issues: {', '.join(s_flags)}\n")
    else:
        print("βœ… All scanned models match the Mistral Nemo baseline specs.")

if __name__ == "__main__":
    # Entry point: run the full scan + audit (no CLI arguments supported).
    check_models()