| import os.path |
| import json |
| from langchain.docstore.document import Document |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
class DocProcessor:
    """Load JSON documents, split them into chunks, and cache the result as JSONL.

    On first run the ".json" files listed in ``LIST_DIR`` are read, chunked with
    a :class:`RecursiveCharacterTextSplitter`, and persisted to ``PATH_SAVE``;
    subsequent runs reload the cached chunks instead of re-splitting.
    """

    def __init__(self, LIST_DIR, PATH_SAVE):
        # LIST_DIR: iterable of file paths to ingest (only ".json" files are used).
        # PATH_SAVE: path of the JSONL cache file for the serialized chunks.
        self.LIST_DIR = LIST_DIR
        self.PATH_SAVE = PATH_SAVE

    def process_data(self):
        """Populate ``self.chunks`` from the cache if present, else build and save it.

        Also records the number of chunks in ``self.docs_size``.
        """
        if os.path.exists(self.PATH_SAVE):
            self.load_docs_from_jsonl()
        else:
            self.create_chuncks()
            self.save_docs_to_jsonl()

        self.docs_size = len(self.chunks)

    def create_chuncks(self, nb_char=1000, chunk_overlap=100):
        """Read every ".json" file in ``LIST_DIR`` and split into overlapping chunks.

        Args:
            nb_char: target chunk size in characters.
            chunk_overlap: characters shared between consecutive chunks.

        Sets ``self.chunks`` to the resulting list of Documents.
        """
        data = []
        for path in self.LIST_DIR:
            # Non-".json" paths are silently skipped by design.
            if path.endswith(".json"):
                # Explicit encoding so chunk boundaries (and hence the cache)
                # do not depend on the platform's locale default.
                with open(path, 'r', encoding='utf-8') as f:
                    data.append(
                        Document(
                            page_content=str(json.load(f)),
                            metadata={"source": path},
                        )
                    )

        # "}" and "]" come first so splits prefer JSON object/array boundaries
        # before falling back to plain-text breaks.
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=nb_char,
            chunk_overlap=chunk_overlap,
            add_start_index=True,
            strip_whitespace=True,
            separators=["}", "]", "\n\n", "\n", ".", " ", ""],
        )

        self.chunks = text_splitter.split_documents(data)
        print("Chunks created")

    def save_docs_to_jsonl(self):
        """Serialize ``self.chunks`` to ``PATH_SAVE``, one JSON document per line."""
        with open(self.PATH_SAVE, 'w', encoding='utf-8') as jsonl_file:
            for doc in self.chunks:
                jsonl_file.write(doc.json() + '\n')
        print("Data saved")

    def load_docs_from_jsonl(self):
        """Rebuild ``self.chunks`` from the JSONL cache at ``PATH_SAVE``."""
        self.chunks = []
        with open(self.PATH_SAVE, 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                self.chunks.append(Document(**json.loads(line)))
| |
| |
|
|
|
|
| |