Files changed (7)
  1. README.md +4 -118
  2. config.json +3 -3
  3. md.py +2 -2
  4. ocr.py +2 -2
  5. special_tokens_map.json +0 -33
  6. tokenizer.json +0 -0
  7. tokenizer_config.json +0 -0
README.md CHANGED
@@ -1,8 +1,6 @@
  ---
  language: en
  license: mit
- library_name: transformers
- pipeline_tag: image-text-to-text
  ---
  # Kosmos-2.5

@@ -18,125 +16,10 @@ Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive
  Since this is a generative model, there is a risk of **hallucination** during the generation process, and it **CAN NOT** guarantee the accuracy of all OCR/Markdown results in the images.

  ## Inference
-
- KOSMOS-2.5 is supported from Transformers >= 4.56. Find the docs [here](https://huggingface.co/docs/transformers/main/en/model_doc/kosmos2_5).
-
  **Markdown Task:** For usage instructions, please refer to [md.py](md.py).

- ```py
- import re
- import torch
- import requests
- from PIL import Image, ImageDraw
- from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration, infer_device
-
- repo = "microsoft/kosmos-2.5"
- device = "cuda:0"
- dtype = torch.bfloat16
- model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, dtype=dtype)
- processor = AutoProcessor.from_pretrained(repo)
-
- # sample image
- url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
- image = Image.open(requests.get(url, stream=True).raw)
-
- prompt = "<md>"
- inputs = processor(text=prompt, images=image, return_tensors="pt")
-
- height, width = inputs.pop("height"), inputs.pop("width")
- raw_width, raw_height = image.size
- scale_height = raw_height / height
- scale_width = raw_width / width
-
- inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
- inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
- generated_ids = model.generate(
-     **inputs,
-     max_new_tokens=1024,
- )
-
- generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
- print(generated_text[0])
- ```
-
  **OCR Task:** For usage instructions, please refer to [ocr.py](ocr.py).

- ```py
- import re
- import torch
- import requests
- from PIL import Image, ImageDraw
- from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration, infer_device
-
- repo = "microsoft/kosmos-2.5"
- device = "cuda:0"
- dtype = torch.bfloat16
- model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, dtype=dtype)
- processor = AutoProcessor.from_pretrained(repo)
-
- # sample image
- url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
- image = Image.open(requests.get(url, stream=True).raw)
-
- # bs = 1
- prompt = "<ocr>"
- inputs = processor(text=prompt, images=image, return_tensors="pt")
- height, width = inputs.pop("height"), inputs.pop("width")
- raw_width, raw_height = image.size
- scale_height = raw_height / height
- scale_width = raw_width / width
-
- # bs > 1, batch generation
- # inputs = processor(text=[prompt, prompt], images=[image,image], return_tensors="pt")
- # height, width = inputs.pop("height"), inputs.pop("width")
- # raw_width, raw_height = image.size
- # scale_height = raw_height / height[0]
- # scale_width = raw_width / width[0]
-
- inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
- inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
- generated_ids = model.generate(
-     **inputs,
-     max_new_tokens=1024,
- )
-
- generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
- def post_process(y, scale_height, scale_width):
-     y = y.replace(prompt, "")
-     if "<md>" in prompt:
-         return y
-     pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
-     bboxs_raw = re.findall(pattern, y)
-     lines = re.split(pattern, y)[1:]
-     bboxs = [re.findall(r"\d+", i) for i in bboxs_raw]
-     bboxs = [[int(j) for j in i] for i in bboxs]
-     info = ""
-     for i in range(len(lines)):
-         box = bboxs[i]
-         x0, y0, x1, y1 = box
-         if not (x0 >= x1 or y0 >= y1):
-             x0 = int(x0 * scale_width)
-             y0 = int(y0 * scale_height)
-             x1 = int(x1 * scale_width)
-             y1 = int(y1 * scale_height)
-             info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
-     return info
-
- output_text = post_process(generated_text[0], scale_height, scale_width)
- print(output_text)
-
- draw = ImageDraw.Draw(image)
- lines = output_text.split("\n")
- for line in lines:
-     # draw the bounding box
-     line = list(line.split(","))
-     if len(line) < 8:
-         continue
-     line = list(map(int, line[:8]))
-     draw.polygon(line, outline="red")
- image.save("output.png")
- ```
-
  ## Citation

  If you find Kosmos-2.5 useful in your research, please cite the following paper:
@@ -153,4 +36,7 @@ If you find Kosmos-2.5 useful in your research, please cite the following paper:
  ## License
  The content of this project itself is licensed under the [MIT](https://github.com/microsoft/unilm/blob/master/kosmos-2.5/LICENSE)

- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
+ [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
+
+
+
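
The inference note removed above pinned KOSMOS-2.5 support to Transformers >= 4.56, and md.py / ocr.py still depend on that. A minimal, hedged pre-flight check against that bound (the 4.56.0 floor comes from the removed README line and should be treated as an assumption if the requirement moves):

```py
# Hedged pre-flight check before running md.py or ocr.py: the README line removed
# in this commit said KOSMOS-2.5 is supported from Transformers >= 4.56.
import transformers
from packaging import version  # packaging ships as a transformers dependency

if version.parse(transformers.__version__) < version.parse("4.56.0"):
    raise RuntimeError(
        f"transformers {transformers.__version__} found; "
        "Kosmos2_5ForConditionalGeneration needs >= 4.56 (pip install -U transformers)."
    )
```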
config.json CHANGED
@@ -82,8 +82,8 @@
  "bos_token_id": null,
  "chunk_size_feed_forward": 0,
  "cross_attention_hidden_size": null,
- "intermediate_size": 3968,
- "head_dim": 64,
+ "d_ff": 3968,
+ "d_kv": 64,
  "decoder_start_token_id": null,
  "dense_act_fn": "gelu_new",
  "diversity_penalty": 0.0,
@@ -133,7 +133,7 @@
  "return_dict": true,
  "return_dict_in_generate": false,
  "sep_token_id": null,
- "max_num_patches": 4096,
+ "seq_len": 4096,
  "suppress_tokens": null,
  "task_specific_params": null,
  "temperature": 1.0,
md.py CHANGED
@@ -11,7 +11,7 @@ model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=devic
  processor = AutoProcessor.from_pretrained(repo)

  # sample image
- url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
+ url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
  image = Image.open(requests.get(url, stream=True).raw)

  prompt = "<md>"
@@ -30,4 +30,4 @@ generated_ids = model.generate(
  )

  generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
- print(generated_text[0])
+ print(generated_text[0])
ocr.py CHANGED
@@ -11,7 +11,7 @@ model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=devic
  processor = AutoProcessor.from_pretrained(repo)

  # sample image
- url = "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png"
+ url = "https://huggingface.co/microsoft/kosmos-2.5/blob/main/receipt_00008.png"
  image = Image.open(requests.get(url, stream=True).raw)

  # bs = 1
@@ -70,4 +70,4 @@ for line in lines:
          continue
      line = list(map(int, line[:8]))
      draw.polygon(line, outline="red")
- image.save("output.png")
+ image.save("output.png")
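
Both md.py and ocr.py now point `url` at a `blob/main` page for the sample receipt. A hedged alternative to hardcoding either URL form is to pull the file through `huggingface_hub` (sketch; assumes `receipt_00008.png` stays at the repo root):

```py
# Hedged alternative for fetching the sample receipt used by md.py / ocr.py:
# download it from the model repo via huggingface_hub rather than hardcoding
# a resolve/blob URL. Assumes receipt_00008.png remains at the repo root.
from huggingface_hub import hf_hub_download
from PIL import Image

path = hf_hub_download(repo_id="microsoft/kosmos-2.5", filename="receipt_00008.png")
image = Image.open(path)
print(image.size)
```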
special_tokens_map.json DELETED
@@ -1,33 +0,0 @@
- {
-   "boi_token": "<image>",
-   "bos_token": {
-     "content": "<s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "eoi_token": "</image>",
-   "eos_token": {
-     "content": "</s>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "image_token": "<s>",
-   "pad_token": {
-     "content": "<pad>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   },
-   "unk_token": {
-     "content": "<unk>",
-     "lstrip": false,
-     "normalized": false,
-     "rstrip": false,
-     "single_word": false
-   }
- }
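
With special_tokens_map.json deleted, the special-token definitions listed above have to come from the remaining tokenizer files (tokenizer.json and tokenizer_config.json, both changed in this commit). A small hedged sanity check that those tokens still resolve once the tokenizer is loaded:

```py
# Hedged sanity check: after removing special_tokens_map.json, confirm the
# special tokens it used to define still resolve from the updated tokenizer files.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/kosmos-2.5")
print("bos:", tok.bos_token)  # was "<s>"
print("eos:", tok.eos_token)  # was "</s>"
print("pad:", tok.pad_token)  # was "<pad>"
print("unk:", tok.unk_token)  # was "<unk>"
```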
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff