import json
import os
import argparse
from tqdm import tqdm
import tiktoken
from openai import OpenAI
from huggingface_hub import hf_hub_download

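# Query GPT-4o once for a single prompt. Decoding is deterministic
# (temperature=0, top_p=0) and the response is forced into JSON-object format
# so callers can parse it with json.loads. Expects the OpenAI API key in the
# OAI environment variable.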
def gpt_4o(input_text):
    client = OpenAI(api_key=os.environ.get("OAI"))
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": [{"type": "text", "text": input_text}]}
        ],
        response_format={"type": "json_object"},
        temperature=0,
        max_tokens=4096,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content

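# For every claim in all_info_with_txt.json, summarise the events in each
# evidence document with GPT-4o and write the per-claim results to
# <data_dir>/gpt4_event_extraction/gpt4o_results_<ID>_claim.json.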
def run_gpt4_event_extraction(data_dir, max_tokens=100000):
    all_info_path = os.path.join(data_dir, "all_info_with_txt.json")
    output_dir = os.path.join(data_dir, "gpt4_event_extraction")
    os.makedirs(output_dir, exist_ok=True)
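    # The in-context-learning (ICL) prompt ships with the
    # PledgeTracker/demo_feedback dataset on the Hugging Face Hub; downloading
    # it requires a valid HF_TOKEN in the environment.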
    icl_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="icl.txt",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"],
    )
    with open(icl_path, "r") as f:
        ICL = f.read()
    with open(all_info_path, "r") as f:
        all_info = f.readlines()

    enc = tiktoken.encoding_for_model("gpt-4o")

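    # all_info_with_txt.json is JSON Lines: one claim per line, with the line
    # index doubling as the claim ID.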
    for i, line in enumerate(all_info):
        ID = i
        urls = []
        results = []

        data = json.loads(line)
        docs = data["evidence"]
        claim = data["claim"]

        output_path = os.path.join(output_dir, f"gpt4o_results_{ID}_claim.json")
        if os.path.exists(output_path):
            print(f"Already exists: {output_path}")
            continue

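        # Summarise each evidence document once, de-duplicating by URL.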
        for doc in tqdm(docs):
            if doc["url"] in urls:
                continue

            text = " ".join(doc["text"])
            input_text = (
                f"{ICL}\nInput:\n\nTitle: {doc['metadata']['title']}\n"
                f"Date: {doc['metadata']['date']}\nArticle: {text}\n\n"
                f"Please only summarize events that are useful for verifying the claim '{claim}', and their dates in the JSON format.\n\nOutput:\n"
            )

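            # Mark the URL as seen, then truncate the prompt to max_tokens
            # with the GPT-4o tokeniser before calling the API.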
            urls.append(doc["url"])
            text_tokens = enc.encode(input_text)
            if len(text_tokens) > max_tokens:
                input_text = enc.decode(text_tokens[:max_tokens])

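            # A failed document (API error or unparseable JSON) is logged and
            # skipped rather than aborting the whole claim.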
            try:
                output = gpt_4o(input_text)
                results.append({
                    "url": doc["url"],
                    "title": doc["metadata"]["title"],
                    "date": doc["metadata"]["date"],
                    "article": text,
                    "output": json.loads(output),
                })
            except Exception as e:
                print(f"Error processing doc: {e}")
                continue

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)

    return output_path

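# CLI entry point. The ICL prompt is fetched from the Hub inside
# run_gpt4_event_extraction, so only the data directory (and optionally the
# token limit) needs to be supplied.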
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run GPT-4o event extraction")
    parser.add_argument("--data_dir", type=str, required=True, help="Root data directory")
    parser.add_argument("--max_tokens", type=int, default=100000, help="Maximum token limit for input")

    args = parser.parse_args()

    run_gpt4_event_extraction(
        data_dir=args.data_dir,
        max_tokens=args.max_tokens,
    )