siyagajbhe committed on
Commit
0f37de1
·
verified ·
1 Parent(s): a32d077

Create src/preprocess_caselaw.py

Browse files
Files changed (1) hide show
  1. src/preprocess_caselaw.py +21 -0
src/preprocess_caselaw.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import json, os
3
+ os.makedirs("data", exist_ok=True)
4
+
5
def chunk_text(text, size=1000):
    """Split *text* into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings that concatenate back to *text*. Every chunk
        has exactly *size* characters except possibly the last one, which
        holds the remainder. An empty *text* yields an empty list.

    Raises:
        ValueError: If *size* is zero or negative.
    """
    # A negative step would make range() empty and silently drop the whole
    # text; a zero step fails with an unrelated range() message. Reject both
    # up front with a clear error.
    if size <= 0:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return [text[start:start + size] for start in range(0, len(text), size)]
7
+
8
# Load the training split and keep only the first 0.1% of its rows.
# NOTE(review): the split-slicing syntax documents integer percentages only
# (e.g. "train[:1%]"), so the fractional "train[:0.1%]" spec is expressed here
# as an explicit row selection instead — confirm against the installed
# `datasets` version.
full_ds = load_dataset("common-pile/caselaw_access_project", split="train")
ds = full_ds.select(range(len(full_ds) // 1000))

# Write one JSON object per line (JSONL), one line per text chunk.
with open("data/caselaw_chunks.jsonl", "w", encoding="utf-8") as f:
    for item in ds:
        # `or ""` also covers rows whose "text" value is present but None,
        # which would otherwise crash on len().
        text = item.get("text") or ""
        if len(text) < 200:
            # Skip documents shorter than 200 characters.
            continue
        # NOTE(review): confirm "case_name" and "court" are top-level columns
        # of this dataset; when a key is absent the field is written as "".
        case_name = item.get("case_name", "")
        court = item.get("court", "")
        for chunk in chunk_text(text):
            record = {
                "case_name": case_name,
                "court": court,
                "text": chunk,
            }
            f.write(json.dumps(record) + "\n")

print("✅ Preprocessed and saved data/caselaw_chunks.jsonl")