| import os.path |
| import json |
| from langchain.docstore.document import Document |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
class DocProcessor:
    """Load JSON documents, split them into chunks, and cache the result as JSONL.

    On first run the ".json" files listed in ``LIST_DIR`` are read, chunked with
    a :class:`RecursiveCharacterTextSplitter`, and persisted to ``PATH_SAVE``;
    subsequent runs reload the cached chunks instead of re-splitting.
    """

    def __init__(self, LIST_DIR, PATH_SAVE):
        # LIST_DIR: iterable of file paths to ingest (only ".json" files are used).
        # PATH_SAVE: path of the JSONL cache file for the serialized chunks.
        self.LIST_DIR = LIST_DIR
        self.PATH_SAVE = PATH_SAVE

    def process_data(self):
        """Populate ``self.chunks`` from the cache if present, else build and save it.

        Also records the number of chunks in ``self.docs_size``.
        """
        if os.path.exists(self.PATH_SAVE):
            self.load_docs_from_jsonl()
        else:
            self.create_chuncks()
            self.save_docs_to_jsonl()

        self.docs_size = len(self.chunks)

    def create_chuncks(self, nb_char=1000, chunk_overlap=100):
        """Read every ".json" file in ``LIST_DIR`` and split into overlapping chunks.

        Args:
            nb_char: target chunk size in characters.
            chunk_overlap: characters shared between consecutive chunks.

        Sets ``self.chunks`` to the resulting list of Documents.
        """
        data = []
        for path in self.LIST_DIR:
            # Non-".json" paths are silently skipped by design.
            if path.endswith(".json"):
                # Explicit encoding so chunk boundaries (and hence the cache)
                # do not depend on the platform's locale default.
                with open(path, 'r', encoding='utf-8') as f:
                    data.append(
                        Document(
                            page_content=str(json.load(f)),
                            metadata={"source": path},
                        )
                    )

        # "}" and "]" come first so splits prefer JSON object/array boundaries
        # before falling back to plain-text breaks.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )

        self.chunks = text_splitter.split_documents(data)
        print("Chunks created")

    def save_docs_to_jsonl(self):
        """Serialize ``self.chunks`` to ``PATH_SAVE``, one JSON document per line."""
        with open(self.PATH_SAVE, 'w', encoding='utf-8') as jsonl_file:
            for doc in self.chunks:
                jsonl_file.write(doc.json() + '\n')
        print("Data saved")

    def load_docs_from_jsonl(self):
        """Rebuild ``self.chunks`` from the JSONL cache at ``PATH_SAVE``."""
        self.chunks = []
        with open(self.PATH_SAVE, 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                self.chunks.append(Document(**json.loads(line)))
| |
| |
|
|
|
|
| |