Langchain-Chatchat/loader/pdf_loader.py
zhenkaivip dd93837343
使用paddleocr实现 (#342)
* jpg and png ocr

* fix

* write docs to tmp file

* fix

* [BUGFIX] local_doc_qa.py line 172: logging have no end args. (#323)

* image loader

* fix

* fix

* update api.py

* update api.py

* update api.py

* update README.md

* update api.py

* add pdf_loader

* fix

---------

Co-authored-by: RainGather <3255329+RainGather@users.noreply.github.com>
Co-authored-by: imClumsyPanda <littlepanda0716@gmail.com>
2023-05-13 08:45:17 +08:00

45 lines
1.7 KiB
Python

"""Loader that loads image files."""
import os
import tempfile
from typing import List

import fitz
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from paddleocr import PaddleOCR
class UnstructuredPaddlePDFLoader(UnstructuredFileLoader):
    """Loader that turns a PDF into text elements, OCR-ing embedded images.

    For every page the text layer is extracted with PyMuPDF (``fitz``) and
    every image referenced by the page is run through PaddleOCR. The combined
    text is written to a ``.txt`` file which is then partitioned by
    ``unstructured``.
    """

    def _get_elements(self) -> List:
        """Return the unstructured elements for ``self.file_path``.

        Returns:
            The list produced by ``partition_text`` for the intermediate text
            file, called with ``self.unstructured_kwargs``.
        """

        def pdf_ocr_txt(filepath, dir_path="tmp_files"):
            """Dump page text plus OCR text of a PDF into a text file.

            Args:
                filepath: Path of the PDF to read.
                dir_path: Directory receiving the output text file and the
                    scratch image; created when missing.

            Returns:
                Path of the written file, ``<dir_path>/<pdf file name>.txt``.
            """
            os.makedirs(dir_path, exist_ok=True)
            filename = os.path.split(filepath)[-1]
            txt_file_path = os.path.join(dir_path, "%s.txt" % (filename))
            ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)

            # A unique scratch file inside dir_path: the directory is known to
            # exist and parallel loads cannot overwrite each other's image.
            fd, img_path = tempfile.mkstemp(suffix=".png", dir=dir_path)
            os.close(fd)
            try:
                with fitz.open(filepath) as doc, \
                        open(txt_file_path, "w", encoding="utf-8") as fout:
                    for page in doc:
                        fout.write(page.get_text("text"))
                        fout.write("\n")
                        for img in page.get_images():
                            # img[0] is the xref of the embedded image.
                            pix = fitz.Pixmap(doc, img[0])
                            if pix.n - pix.alpha >= 4:
                                # CMYK cannot be written as PNG; go via RGB.
                                pix = fitz.Pixmap(fitz.csRGB, pix)
                            pix.save(img_path)
                            result = ocr.ocr(img_path)
                            # Each detection is [box, (text, score)]. Skip
                            # empty/None entries, which can appear when no
                            # text was found in the image.
                            ocr_lines = [
                                detection[1][0]
                                for line in (result or [])
                                if line
                                for detection in line
                            ]
                            if ocr_lines:
                                fout.write("\n".join(ocr_lines))
                                # Keep the next page's text on its own line.
                                fout.write("\n")
            finally:
                # Always drop the scratch image, even when OCR/saving failed.
                os.remove(img_path)
            return txt_file_path

        txt_file_path = pdf_ocr_txt(self.file_path)
        from unstructured.partition.text import partition_text
        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)