mirror of
https://github.com/RYDE-WORK/Langchain-Chatchat.git
synced 2026-01-19 21:37:20 +08:00
* jpg and png ocr * fix * write docs to tmp file * fix * [BUGFIX] local_doc_qa.py line 172: logging have no end args. (#323) * image loader * fix * fix * update api.py * update api.py * update api.py * update README.md * update api.py * add pdf_loader * fix --------- Co-authored-by: RainGather <3255329+RainGather@users.noreply.github.com> Co-authored-by: imClumsyPanda <littlepanda0716@gmail.com>
29 lines
1.2 KiB
Python
29 lines
1.2 KiB
Python
"""Loader that loads image files."""
|
|
from typing import List
|
|
|
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|
from paddleocr import PaddleOCR
|
|
import os
|
|
|
|
|
|
class UnstructuredPaddleImageLoader(UnstructuredFileLoader):
    """Loader that uses unstructured to load image files, such as PNGs and JPGs.

    The image at ``self.file_path`` is run through PaddleOCR, the recognized
    text is written to a ``.txt`` file, and that text file is then partitioned
    with ``unstructured.partition.text.partition_text``.
    """

    def _get_elements(self) -> List:
        """OCR the image and return the elements partitioned from its text.

        Returns:
            The list produced by ``partition_text`` for the OCR text file,
            called with ``self.unstructured_kwargs``.
        """

        def image_ocr_txt(filepath, dir_path="tmp_files"):
            """Run OCR on ``filepath`` and write the text to ``dir_path``.

            Args:
                filepath: Path of the image to recognize.
                dir_path: Directory that receives the ``<image name>.txt``
                    output; created if it does not exist.

            Returns:
                Path of the written text file (one recognized line per row).
            """
            # exist_ok avoids the check-then-create race when several
            # loaders start at the same time.
            os.makedirs(dir_path, exist_ok=True)
            filename = os.path.basename(filepath)

            ocr = PaddleOCR(lang="ch", use_gpu=False, show_log=False)
            result = ocr.ocr(img=filepath)

            # ``result`` holds one entry per image; each entry is a list of
            # detections whose second item starts with the recognized text.
            # PaddleOCR may report an image without any detected text as
            # ``None`` (see the PaddleOCR docs), so skip empty/None entries
            # instead of failing while iterating over them.
            ocr_result = [
                detection[1][0]
                for line in (result or [])
                if line
                for detection in line
            ]

            txt_file_path = os.path.join(dir_path, f"{filename}.txt")
            with open(txt_file_path, "w", encoding="utf-8") as fout:
                fout.write("\n".join(ocr_result))
            return txt_file_path

        txt_file_path = image_ocr_txt(self.file_path)
        # Imported here, as in the original, so ``unstructured`` is only
        # loaded when elements are actually requested.
        from unstructured.partition.text import partition_text

        return partition_text(filename=txt_file_path, **self.unstructured_kwargs)
|