Errors merging Karcher

#1
by Naphula - opened
  File "C:\mergekit-main\mergekit\graph.py", line 484, in _run
    res = task.execute(**arguments)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\mergekit-main\mergekit\merge_methods\karcher.py", line 55, in execute
    return karcher_merge_tensors(
           ^^^^^^^^^^^^^^^^^^^^^^
  File "C:\mergekit-main\mergekit\merge_methods\karcher.py", line 153, in karcher_merge_tensors
    u += a * ui
RuntimeError: The size of tensor a (131072) must match the size of tensor b (131078) at non-singleton dimension 0

It merges fine with SLERP but not with Karcher. Any ideas how to fix this? I ran into a similar issue with Impish Magic 24B.
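
A quick way to confirm where the mismatch comes from is to print each model's config vocab_size and embedding shapes before merging. A minimal sketch, assuming Mistral-style tensor names and sharded safetensors (the paths are placeholders):

# check_vocab.py - minimal vocab-size check across model folders (paths are placeholders)
import glob
import json
import os

from safetensors import safe_open

VOCAB_KEYS = {"model.embed_tokens.weight", "lm_head.weight"}

def report_vocab(model_dir):
    config_path = os.path.join(model_dir, "config.json")
    if os.path.exists(config_path):
        with open(config_path) as f:
            print(model_dir, "-> config vocab_size:", json.load(f).get("vocab_size"))
    for shard in sorted(glob.glob(os.path.join(model_dir, "*.safetensors"))):
        with safe_open(shard, framework="pt") as f:
            for key in f.keys():
                if key in VOCAB_KEYS:
                    print(f"  {key}: {tuple(f.get_tensor(key).shape)} ({os.path.basename(shard)})")

for path in (r"path/to/model_a", r"path/to/model_b"):  # placeholders
    report_vocab(path)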

I don't know if this will fix the merge bugs and repetition, but I'm testing it now with the original NousResearch safetensors.

Gemini claims this should work, and so far it's merging without errors, so I'll test it with your safetensors next and report back with the results.

vocab_resizer.py

import glob
import json
import os
import shutil

from safetensors import safe_open
from safetensors.torch import load_file, save_file

def fix_vocab_size(model_dir, output_dir, new_vocab_size=131072):
    """
    Resizes the vocabulary-dependent tensors of a sharded model to a new size.

    Args:
        model_dir (str): The directory of the model to fix.
        output_dir (str): The directory where the fixed model will be saved.
        new_vocab_size (int): The target vocabulary size.
    """
    try:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"Created output directory: {output_dir}")

        # --- Step 1: Find all safetensor shards ---
        search_pattern = os.path.join(model_dir, '*.safetensors')
        shard_paths = sorted(glob.glob(search_pattern))
        if not shard_paths:
            print(f"Error: No '.safetensors' files found in {model_dir}")
            return
        
        print(f"Found {len(shard_paths)} shards to process.")

        # --- Step 2: Identify which shards contain the vocab tensors ---
        vocab_tensor_keys = ["model.embed_tokens.weight", "lm_head.weight"]
        shards_to_modify = {} # {filename: {key: tensor}}

        for shard_path in shard_paths:
            filename = os.path.basename(shard_path)
            # safe_open only reads the header up front, so we can pull out just the vocab tensors
            with safe_open(shard_path, framework="pt") as f:
                for key in f.keys():
                    if key in vocab_tensor_keys:
                        if filename not in shards_to_modify:
                            shards_to_modify[filename] = {}
                        # Load only this tensor rather than the whole shard
                        shards_to_modify[filename][key] = f.get_tensor(key)
                        print(f"Found '{key}' in shard: {filename}")

        if not shards_to_modify:
            print("Error: Could not find 'embed_tokens' or 'lm_head' tensors in any shard.")
            return

        # --- Step 3: Process all shards, modifying the ones with vocab tensors ---
        for shard_path in shard_paths:
            filename = os.path.basename(shard_path)
            output_shard_path = os.path.join(output_dir, filename)

            # Load all tensors from the current shard
            tensors = load_file(shard_path)

            if filename in shards_to_modify:
                print(f"Resizing tensors in {filename}...")
                for key, tensor in shards_to_modify[filename].items():
                    original_size = tensor.shape[0]
                    if original_size < new_vocab_size:
                        print(f"  - Warning: '{key}' only has {original_size} rows; cannot trim to {new_vocab_size}, leaving it unchanged.")
                        continue
                    print(f"  - Trimming '{key}' from {original_size} to {new_vocab_size} rows")
                    # Keep only the first new_vocab_size rows of the vocab-dependent matrix
                    tensors[key] = tensor[:new_vocab_size, :]  # Replace the tensor in the loaded dict
            
            # Save the (potentially modified) tensors to the new location
            save_file(tensors, output_shard_path)
            print(f"Saved new shard: {output_shard_path}")

        # --- Step 4: Modify and save the config.json ---
        config_path = os.path.join(model_dir, 'config.json')
        new_config_path = os.path.join(output_dir, 'config.json')
        if os.path.exists(config_path):
            with open(config_path, 'r') as f:
                config = json.load(f)
            
            print(f"\nUpdating config.json: 'vocab_size' from {config.get('vocab_size')} to {new_vocab_size}")
            config['vocab_size'] = new_vocab_size
            
            with open(new_config_path, 'w') as f:
                json.dump(config, f, indent=2)
            print(f"Saved new config.json to {new_config_path}")
        else:
            print("Warning: config.json not found. Please create it manually.")

        # --- Step 5: Copy other essential files ---
        for filename in os.listdir(model_dir):
            if filename.endswith(('.json', '.py', '.md', '.txt')) and filename != 'config.json':
                if not os.path.exists(os.path.join(output_dir, filename)):
                    shutil.copy2(os.path.join(model_dir, filename), output_dir)
                    print(f"Copied {filename} to output directory.")

        print("\nVocabulary resizing complete. The model is now ready for merging.")

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # --- Configuration ---
    # Directory of the original DeepHermes-24B model
    input_model_directory = r"path/to/your/DeepHermes-24B"

    # Directory to save the fixed, merge-ready model
    output_model_directory = r"path/to/your/DeepHermes-24B-fixed"
    
    # The standard vocab size you are targeting for the merge
    target_vocab_size = 131072

    # --- Run the script ---
    fix_vocab_size(input_model_directory, output_model_directory, target_vocab_size)
Owner

Hiya!! just at a glance? it's a vocab/embedding mismatch rather than a mergekit bug.

I have to repair vocab sizes in merges on occasion because I merge older models with newer ones. Pain in the arse.
I gotta make the vocab sizes equal before merging. Check each model's config.json for "vocab_size".
If they differ? there are a few fixes. The easiest is to use the one with the smaller size as the target and trim the embedding tensors in the others. But that's not always the right choice, right? I've tried "padding" (annoying and troublesome and can lead to looping), but there is also the "copy the larger tokenizer and paste it into the model with the smaller tokenizer" file trick (make sure you keep a copy so you can put the original back if needed). I haven't tried that one yet. Trimming is "safer" than stuffing in terms of "shiz may go sideways post merge" 🤪
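
If you do want to try the padding route anyway, here's a rough sketch (tensor names assume the usual embed_tokens / lm_head layout; padded rows are mean-initialised rather than zeroed, which seems to loop less):

# pad_vocab.py - sketch of growing vocab-dependent tensors instead of trimming them
import torch

def pad_vocab_tensor(tensor: torch.Tensor, target_rows: int) -> torch.Tensor:
    """Pad a [vocab, hidden] matrix up to target_rows with mean-initialised rows."""
    current = tensor.shape[0]
    if current >= target_rows:
        return tensor
    pad = tensor.mean(dim=0, keepdim=True).repeat(target_rows - current, 1)
    return torch.cat([tensor, pad.to(tensor.dtype)], dim=0)

# usage (hypothetical tensor dict from load_file):
# tensors["model.embed_tokens.weight"] = pad_vocab_tensor(tensors["model.embed_tokens.weight"], 131078)
# tensors["lm_head.weight"] = pad_vocab_tensor(tensors["lm_head.weight"], 131078)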

anyway, hope that's any help at all.

Owner

whoops missed a HUGE chunk of that!!! wasn't showing in my viewer!! Apologies!!!! please ignore

Thanks for the tips. I'm uploading a GGUF for test98 and test99 to compare. So far in Kobold I'm not seeing any bugs with test98 Q6_K.

I'll quantize this model too for comparison. Getting <tool_call> spam with the regular DeepHermes GGUF.

Edit: Running into quantization errors with karcher:

assert max(tokenizer.vocab.values()) < vocab_size
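
That assert presumably means the tokenizer still defines ids beyond the trimmed vocab_size. A quick check (a sketch; assumes a transformers-loadable tokenizer in the model folder, and the path is a placeholder):

# check_tokenizer_ids.py - compare the tokenizer's highest id against config vocab_size
import json
import os

from transformers import AutoTokenizer

model_dir = r"path/to/merged-model"  # placeholder
tok = AutoTokenizer.from_pretrained(model_dir)
with open(os.path.join(model_dir, "config.json")) as f:
    vocab_size = json.load(f)["vocab_size"]

max_id = max(tok.get_vocab().values())
print(f"max token id: {max_id}, config vocab_size: {vocab_size}")
if max_id >= vocab_size:
    print("Tokenizer defines ids beyond vocab_size; trim/replace the tokenizer or grow the embeddings.")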

Owner

Yeah, the little bugger kept tool-calling out for pizza on me as well. Had to take out all the tool tokens. Good luck!

I'll try to patch halfeaten salad later too if I have time.
Vortex5/ChaosRose-24B
Using the vocab resize to see if this can fix the repetition.

Can't get Karcher to merge yet (strange errors), but the SLERP merge of Magidonia and DeepHermes is working well, no tool calls. Still need to test the union tokenizer.

Q6 should be up soon. https://huggingface.co/Naphula/test98

Owner

That's kind of you if you manage!! I'm trying to test it today, but runpod and vast keep singing me CUDA errors when I build vLLM in them, and my forehead is getting bruised.
I reaaaaally should be doing my research report for my akchewal job? heh. Many thanks if you get round to it, no worries if you don't.

Owner

Oh! Actually it's great! There was a lot of repetition with blathering sentences, "he said", "he moved", "he tilted", etc. etc., instead of making paragraphs, but it's just fiddling with the parameters... temp 0.8 with min_p cranked to 0.2 gives a surprisingly decently written, rather charming rogue with the fp16. I'll have to quant another day to find out how the quants do. Thanks for the offer Naphula, seems ok.
