Errors merging with Karcher
File "C:\mergekit-main\mergekit\graph.py", line 484, in _run
res = task.execute(**arguments)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\mergekit-main\mergekit\merge_methods\karcher.py", line 55, in execute
return karcher_merge_tensors(
^^^^^^^^^^^^^^^^^^^^^^
File "C:\mergekit-main\mergekit\merge_methods\karcher.py", line 153, in karcher_merge_tensors
u += a * ui
RuntimeError: The size of tensor a (131072) must match the size of tensor b (131078) at non-singleton dimension 0
The models merge fine with SLERP but not with Karcher. Any ideas how to fix this? I ran into a similar issue with Impish Magic 24B.
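For reference, a quick way to confirm which model carries the 131078-row embedding is to read the shapes straight out of each model's shards. This is only a rough sketch: the paths are placeholders, and the tensor names assume a standard Mistral/Llama-style layout.

# check_embed_shapes.py - print embedding / lm_head row counts for each model (sketch)
import glob
import os

from safetensors import safe_open

for model_dir in (r"path/to/model_a", r"path/to/model_b"):  # placeholder paths
    for shard in sorted(glob.glob(os.path.join(model_dir, "*.safetensors"))):
        with safe_open(shard, framework="pt") as f:
            for name in f.keys():
                if name in ("model.embed_tokens.weight", "lm_head.weight"):
                    # get_slice only reads the header, so no tensor data is loaded
                    print(model_dir, name, f.get_slice(name).get_shape())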
I don't know if this will fix the merge errors and the repetition, but I'm testing it now with the original NousResearch safetensors.
Gemini claims this should work, and so far it's merging without errors, so I'll test it with your safetensors next and report back the results.
vocab_resizer.py
import glob
import json
import os
import shutil

from safetensors.torch import load_file, save_file


def fix_vocab_size(model_dir, output_dir, new_vocab_size=131072):
    """
    Resizes the vocabulary-dependent tensors of a sharded model to a new size.

    Args:
        model_dir (str): The directory of the model to fix.
        output_dir (str): The directory where the fixed model will be saved.
        new_vocab_size (int): The target vocabulary size.
    """
    try:
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"Created output directory: {output_dir}")

        # --- Step 1: Find all safetensor shards ---
        search_pattern = os.path.join(model_dir, '*.safetensors')
        shard_paths = sorted(glob.glob(search_pattern))
        if not shard_paths:
            print(f"Error: No '.safetensors' files found in {model_dir}")
            return
        print(f"Found {len(shard_paths)} shards to process.")

        # --- Step 2: Identify which shards contain the vocab tensors ---
        vocab_tensor_keys = ["model.embed_tokens.weight", "lm_head.weight"]
        shards_to_modify = {}  # {filename: {key: tensor}}
        for shard_path in shard_paths:
            # Read only the safetensors header (8-byte length prefix + JSON)
            # to list tensor names without loading the whole shard.
            with open(shard_path, "rb") as f:
                header_size = int.from_bytes(f.read(8), 'little')
                header = json.loads(f.read(header_size).decode('utf-8'))
            for key in header.keys():
                if key in vocab_tensor_keys:
                    filename = os.path.basename(shard_path)
                    if filename not in shards_to_modify:
                        shards_to_modify[filename] = {}
                    # Load the shard and keep just this tensor for resizing
                    shards_to_modify[filename][key] = load_file(shard_path)[key]
                    print(f"Found '{key}' in shard: {filename}")
        if not shards_to_modify:
            print("Error: Could not find 'embed_tokens' or 'lm_head' tensors in any shard.")
            return

        # --- Step 3: Process all shards, modifying the ones with vocab tensors ---
        for shard_path in shard_paths:
            filename = os.path.basename(shard_path)
            output_shard_path = os.path.join(output_dir, filename)
            # Load all tensors from the current shard
            tensors = load_file(shard_path)
            if filename in shards_to_modify:
                print(f"Resizing tensors in {filename}...")
                for key, tensor in shards_to_modify[filename].items():
                    original_size = tensor.shape[0]
                    print(f"  - Resizing '{key}' from {original_size} to {new_vocab_size}")
                    # Trim the tensor (rows = token ids) to the new vocabulary size
                    resized_tensor = tensor[:new_vocab_size, :]
                    tensors[key] = resized_tensor  # Replace the tensor in the loaded dict
            # Save the (potentially modified) tensors to the new location;
            # the "pt" format tag keeps the shards loadable by transformers.
            save_file(tensors, output_shard_path, metadata={"format": "pt"})
            print(f"Saved new shard: {output_shard_path}")

        # --- Step 4: Modify and save the config.json ---
        config_path = os.path.join(model_dir, 'config.json')
        new_config_path = os.path.join(output_dir, 'config.json')
        if os.path.exists(config_path):
            with open(config_path, 'r') as f:
                config = json.load(f)
            print(f"\nUpdating config.json: 'vocab_size' from {config.get('vocab_size')} to {new_vocab_size}")
            config['vocab_size'] = new_vocab_size
            with open(new_config_path, 'w') as f:
                json.dump(config, f, indent=2)
            print(f"Saved new config.json to {new_config_path}")
        else:
            print("Warning: config.json not found. Please create it manually.")

        # --- Step 5: Copy other essential files ---
        # Note: model.safetensors.index.json is copied as-is; its 'total_size'
        # metadata may be slightly off after trimming.
        for filename in os.listdir(model_dir):
            if filename.endswith(('.json', '.py', '.md', '.txt')) and filename != 'config.json':
                if not os.path.exists(os.path.join(output_dir, filename)):
                    shutil.copy2(os.path.join(model_dir, filename), output_dir)
                    print(f"Copied {filename} to output directory.")

        print("\nVocabulary resizing complete. The model is now ready for merging.")
    except Exception as e:
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    # --- Configuration ---
    # Directory of the original DeepHermes-24B model
    input_model_directory = r"path/to/your/DeepHermes-24B"
    # Directory to save the fixed, merge-ready model
    output_model_directory = r"path/to/your/DeepHermes-24B-fixed"
    # The standard vocab size you are targeting for the merge
    target_vocab_size = 131072

    # --- Run the script ---
    fix_vocab_size(input_model_directory, output_model_directory, target_vocab_size)
Hiya!! Just at a glance? It's a vocab/embedding mismatch rather than a mergekit bug.
I have to repair vocab sizes in merges on occasion because I merge older models with newer ones. Pain in the arse.
I gotta equalise the vocab sizes before merging. Check each model's config.json for "vocab_size".
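Something like this prints them side by side (the paths are placeholders):

import json
import os

for model_dir in (r"path/to/model_a", r"path/to/model_b"):  # placeholder dirs
    with open(os.path.join(model_dir, "config.json")) as f:
        print(model_dir, "vocab_size =", json.load(f).get("vocab_size"))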
If they differ? There are two fixes. The easiest is to use the model with the smaller size as the target and trim the embedding tensors in the others. But that's not always the right choice, right? There are various fixes. I've tried "padding" (annoying, troublesome, and it can lead to looping), and there's also the "copy the larger tokenizer files into the model with the smaller tokeniser" trick (make sure you keep a copy so you can put the originals back if needed). I haven't tried that one yet. Trimming is "safer" than stuffing in terms of "shiz may go sideways post merge" 🤪
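For reference, padding would look roughly like the opposite of the trim in the script above: grow the embed_tokens and lm_head rows to the larger vocab instead of cutting them. This is a rough, untested sketch that assumes a single unsharded model.safetensors and placeholder paths:

# pad_vocab.py - grow embed_tokens / lm_head to a larger vocab with zero rows (rough sketch)
import torch
from safetensors.torch import load_file, save_file

target_vocab_size = 131078  # placeholder: the larger model's vocab size
path_in = r"path/to/smaller-model/model.safetensors"   # placeholder, assumes one shard
path_out = r"path/to/smaller-model-padded/model.safetensors"

tensors = load_file(path_in)
for key in ("model.embed_tokens.weight", "lm_head.weight"):
    if key in tensors:
        t = tensors[key]
        extra = target_vocab_size - t.shape[0]
        if extra > 0:
            # New rows are zeros; they map to token ids the smaller tokenizer
            # never produces, but they can still drift during a merge, which
            # is part of why padding can misbehave compared to trimming.
            pad = torch.zeros(extra, t.shape[1], dtype=t.dtype)
            tensors[key] = torch.cat([t, pad], dim=0)

save_file(tensors, path_out, metadata={"format": "pt"})
# Remember to bump "vocab_size" in config.json to match.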
anyway, hope that's any help at all.
whoops missed a HUGE chunk of that!!! wasn't showing in my viewer!! Apologies!!!! please ignore
Thanks for the tips. I'm uploading a GGUF for test98 and test99 to compare. So far I'm not seeing any bugs with test98 Q6_K in Kobold.
I'll quantize this model too for comparison. Getting <tool_call> spam with the regular DeepHermes GGUF.
Edit: Running into quantization errors with Karcher:
assert max(tokenizer.vocab.values()) < vocab_size
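That assert usually means the tokenizer still defines token ids at or above the trimmed vocab_size in config.json. A quick way to see the gap (a sketch, assuming the merged model loads with transformers' AutoTokenizer; model_dir is a placeholder):

# check_tokenizer_vs_vocab.py - compare the tokenizer's highest token id to config.json (sketch)
import json
import os

from transformers import AutoTokenizer

model_dir = r"path/to/merged-model"  # placeholder path

with open(os.path.join(model_dir, "config.json")) as f:
    vocab_size = json.load(f)["vocab_size"]

tok = AutoTokenizer.from_pretrained(model_dir)
max_id = max(tok.get_vocab().values())  # highest id across base and added tokens

print(f"config vocab_size = {vocab_size}, max tokenizer id = {max_id}")
if max_id >= vocab_size:
    print("Tokenizer ids exceed vocab_size; trim the tokenizer's added tokens "
          "or pad the embeddings instead of trimming them.")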
Yeah. Little bugger kept tool-calling out for pizza on me as well. Had to take out all the tool tokens. Good luck!
I'll try to patch halfeaten salad later too if I have time.
Vortex5/ChaosRose-24B
Using the vocab resize to see if this can fix the repetition.
Can't get Karcher to merge yet (strange errors), but the SLERP merge of Magidonia and DeepHermes is working well, with no tool calls. Still need to test the union tokenizer.
Q6 should be up soon. https://huggingface.co/Naphula/test98
That's kind of you if you manage it!! I'm trying to test it today, but RunPod and Vast keep singing me CUDA errors when I build vLLM in them, and my forehead is getting bruised.
I reaaaaally should be doing the research report for my akchewal job? Heh. Many thanks if you get round to it, no worries if you don't.
Oh! Actually it's great! There was a lot of repetition, with blathering sentences ("he said", "he moved", "he tilted", etc.) instead of proper paragraphs, but it's just a matter of fiddling with the parameters... temp 0.8 with min_p cranked to 0.2 gives a surprisingly decently written, rather charming rogue with the fp16. I'll have to quant another day and find out how the quants do. Thanks for the offer Naphula, seems OK.