Spaces:
Sleeping
Sleeping
Create src/preprocess_caselaw.py
Browse files- src/preprocess_caselaw.py +21 -0
src/preprocess_caselaw.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
import json, os
|
| 3 |
+
# Make sure the output directory exists before anything is written into it;
# exist_ok=True turns this into a no-op when "data" is already present.
os.makedirs("data", exist_ok=True)
|
| 4 |
+
|
| 5 |
+
def chunk_text(text, size=1000):
    """Cut ``text`` into consecutive slices of ``size`` characters.

    Slices are returned in their original order as a list. For a positive
    ``size`` every slice is exactly ``size`` characters long, except the
    final one, which carries whatever remains. Empty ``text`` gives ``[]``.
    """
    pieces = []
    for start in range(0, len(text), size):
        stop = start + size
        pieces.append(text[start:stop])
    return pieces
|
| 7 |
+
|
| 8 |
+
# Take roughly 0.1% of the training split. Slice specs passed to `datasets`
# only accept whole-number bounds (e.g. "train[:1%]" or "train[:100]"), so
# "train[:0.1%]" is not a valid spec; load the split and keep the first
# 1/1000th of its rows explicitly instead.
ds = load_dataset("common-pile/caselaw_access_project", split="train")
ds = ds.select(range(len(ds) // 1000))

# One JSON object per line. The encoding is pinned so the file does not
# depend on the platform's default locale.
with open("data/caselaw_chunks.jsonl", "w", encoding="utf-8") as f:
    for item in ds:
        # `or ""` also covers rows where "text" is present but None.
        text = item.get("text") or ""
        if len(text) < 200:
            # Skip documents shorter than 200 characters.
            continue
        # NOTE(review): these fall back to "" when the key is absent —
        # confirm the dataset really exposes them as top-level columns.
        case_name = item.get("case_name", "")
        court = item.get("court", "")
        for chunk in chunk_text(text):
            record = {
                "case_name": case_name,
                "court": court,
                "text": chunk,
            }
            f.write(json.dumps(record) + "\n")

print("✅ Preprocessed and saved data/caselaw_chunks.jsonl")
|