跳转至

Document Intelligence Module

pipelines.pipelines.nodes.document.document_intelligence

DocPrompter

DocPrompter: extract prompt's answers from the document input.

Source code in pipelines/pipelines/nodes/document/document_intelligence.py
class DocPrompter(BaseComponent):
    """
    DocPrompter: extract prompt's answers from the document input.

    The node loads a static-graph inference model from ``task_path`` and, for
    every prompt of an OCR-processed document, returns up to ``topn`` answer
    candidates.
    """

    # Class-level settings shared by all instances; they are not read inside
    # this class (presumably consumed by the pipeline framework - verify
    # against BaseComponent).
    return_no_answers: bool
    outgoing_edges = 1
    query_count = 0
    query_time = 0

    def __init__(
        self,
        topn: int = 1,
        use_gpu: bool = True,
        task_path: str = None,
        model: str = "docprompt",
        device_id: int = 0,
        num_threads: int = None,
        lang: str = "ch",
        batch_size: int = 1,
    ):
        """
        Init Document Prompter.
        :param topn: return top n answers.
        :param use_gpu: Whether to run on GPU. Falls back on CPU if no GPU is available.
        :param task_path: Custom model path if using custom model parameters.
        :param model: Choose model name.
        :param device_id: Choose gpu device id.
        :param num_threads: Number of processing threads. Defaults to half of the CPU cores (rounded up).
        :param lang: Choose language.
        :param batch_size: Number of samples the model receives in one batch for inference.
                           Memory consumption is much lower in inference mode. Recommendation: Increase the batch size
                           to a value so only a single batch is used.
        """
        # A CPU-only environment always overrides the caller's request for GPU.
        self._use_gpu = False if paddle.get_device() == "cpu" else use_gpu
        self.model = model
        self._device_id = device_id
        self._num_threads = num_threads if num_threads else math.ceil(cpu_count() / 2)
        self._topn = topn
        self._lang = lang
        self._batch_size = batch_size
        if task_path is None:
            self._task_path = os.path.join(PPNLP_HOME, "pipelines", "document_intelligence", self.model)
        else:
            self._task_path = task_path

        # The model files must be present before the predictor is created.
        download_file(self._task_path, "docprompt_params.tar", URLS[self.model][0], URLS[self.model][1])
        self._get_inference_model()
        self._tokenizer = AutoTokenizer.from_pretrained("ernie-layoutx-base-uncased")
        self._reader = ImageReader(super_rel_pos=False, tokenizer=self._tokenizer)

    def _get_inference_model(self):
        """
        Locate the exported static model under ``task_path`` and build the predictor.
        """
        inference_model_path = os.path.join(self._task_path, "static", "inference")
        self._static_model_file = inference_model_path + ".pdmodel"
        self._static_params_file = inference_model_path + ".pdiparams"
        self._config = paddle.inference.Config(self._static_model_file, self._static_params_file)
        self._prepare_static_mode()

    def _prepare_static_mode(self):
        """
        Construct the input data and predictor in the PaddlePaddle static mode.
        """
        # Honour the resolved ``use_gpu`` setting: it is already False when only
        # the CPU is available, and it lets callers force CPU on a GPU machine.
        if self._use_gpu:
            self._config.enable_use_gpu(100, self._device_id)
            self._config.delete_pass("embedding_eltwise_layernorm_fuse_pass")
        else:
            self._config.disable_gpu()
            self._config.enable_mkldnn()
        self._config.set_cpu_math_library_num_threads(self._num_threads)
        self._config.switch_use_feed_fetch_ops(False)
        self._config.disable_glog_info()
        self._config.enable_memory_optim()
        self._config.switch_ir_optim(False)
        self.predictor = paddle.inference.create_predictor(self._config)
        self.input_names = list(self.predictor.get_input_names())
        self.input_handles = [self.predictor.get_input_handle(name) for name in self.input_names]
        self.output_handle = [self.predictor.get_output_handle(name) for name in self.predictor.get_output_names()]

    def _collect_raw_results(self, data_loader):
        """
        Run the predictor on every batch and return one raw result per feature.
        """
        RawResult = collections.namedtuple("RawResult", ["unique_id", "seq_logits"])
        raw_results = []
        for data in data_loader:
            # Batch fields are fed positionally, in the order of the model inputs.
            for idx, input_handle in enumerate(self.input_handles):
                input_handle.copy_from_cpu(data[idx])
            self.predictor.run()
            unique_ids, seq_logits = [output_handle.copy_to_cpu() for output_handle in self.output_handle]
            for unique_id, logits in zip(unique_ids, seq_logits):
                raw_results.append(RawResult(unique_id=int(unique_id), seq_logits=logits))
        return raw_results

    def _decode_predictions(self, all_results):
        """
        Turn raw model outputs into one ``{"prompt", "result"}`` dict per reader example.
        """
        all_examples = self._reader.examples["infer"]
        all_features = self._reader.features["infer"]
        all_key_probs = [1 for _ in all_examples]

        # One example may be split into several features; group them by qas_id.
        features_by_qas_id = collections.defaultdict(list)
        for feature in all_features:
            features_by_qas_id[feature.qas_id].append(feature)

        unique_id_to_result = {result.unique_id: result for result in all_results}

        all_predictions = []
        all_boxes = {}
        for example_index, doc_example in enumerate(all_examples):
            example_qas_id = doc_example.qas_id
            # The part of qas_id before the first "_" is used as the page key.
            page_id = example_qas_id.split("_")[0]
            if page_id not in all_boxes:
                all_boxes[page_id] = doc_example.ori_boxes
            example_query = doc_example.keys[0]

            preds = []
            for feature in features_by_qas_id[example_qas_id]:
                result = unique_id_to_result.get(feature.unique_id)
                if result is None:
                    # No model output was produced for this feature.
                    continue
                ans_pos = find_answer_pos(result.seq_logits, feature)
                preds.extend(
                    get_doc_pred(
                        result, ans_pos, doc_example, self._tokenizer, feature, True, all_key_probs, example_index
                    )
                )

            if not preds:
                # Always report an (empty) answer so every prompt has a result.
                preds.append({"value": "", "prob": 0.0, "start": -1, "end": -1})
            else:
                preds = sort_res(example_query, preds, doc_example.doc_tokens, all_boxes[page_id], self._lang)[
                    : self._topn
                ]
            all_predictions.append({"prompt": example_query, "result": preds})
        return all_predictions

    def _run_model(self, inputs: List[dict]):
        """
        Run docprompt model.

        :param inputs: List of dicts with the keys ``ocr_result``, ``doc``, ``prompt`` and ``ocr_type``.
        :return: One list per input, holding a ``{"prompt": ..., "result": [...]}`` dict per prompt.
        """
        all_predictions_list = []
        for example in inputs:
            ocr_result = example["ocr_result"]
            doc_path = example["doc"]
            prompt = example["prompt"]
            ocr_type = example["ocr_type"]

            if not ocr_result:
                # Nothing was recognised in the document: answer every prompt with an empty result.
                all_predictions = [
                    {"prompt": p, "result": [{"value": "", "prob": 0.0, "start": -1, "end": -1}]} for p in prompt
                ]
            else:
                data_loader = self._reader.data_generator(ocr_result, doc_path, prompt, self._batch_size, ocr_type)
                all_results = self._collect_raw_results(data_loader)
                all_predictions = self._decode_predictions(all_results)
            all_predictions_list.append(all_predictions)
        return all_predictions_list

    def run(self, example: dict):
        """
        Answer the prompts of a single preprocessed document.

        :param example: Dict with the keys ``ocr_result``, ``doc``, ``prompt`` and ``ocr_type``.
        :return: Tuple of ``{"results": predictions}`` and the outgoing edge name ``"output_1"``.
        """
        results = self._run_model([example])
        output = {"results": results}
        return output, "output_1"

__init__

__init__(topn: int = 1, use_gpu: bool = True, task_path: str = None, model: str = 'docprompt', device_id: int = 0, num_threads: int = None, lang: str = 'ch', batch_size: int = 1)

Init Document Prompter.

Parameters:

Name Type Description Default
topn int

return top n answers.

1
use_gpu bool

Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.

True
task_path str

Custom model path if using custom model parameters.

None
model str

Choose model name.

'docprompt'
device_id int

Choose gpu device id.

0
num_threads int

Number of processing threads.

None
lang str

Choose language.

'ch'
batch_size int

Number of samples the model receives in one batch for inference. Memory consumption is much lower in inference mode. Recommendation: Increase the batch size to a value so only a single batch is used.

1
Source code in pipelines/pipelines/nodes/document/document_intelligence.py
def __init__(
    self,
    topn: int = 1,
    use_gpu: bool = True,
    task_path: str = None,
    model: str = "docprompt",
    device_id: int = 0,
    num_threads: int = None,
    lang: str = "ch",
    batch_size: int = 1,
):
    """
    Init Document Prompter.
    :param topn: return top n answers.
    :param use_gpu: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
    :param task_path: Custom model path if using custom model parameters.
    :param model: Choose model name.
    :param device_id: Choose gpu device id.
    :param num_threads: Number of processing threads. Defaults to half of the CPU cores (rounded up).
    :param lang: Choose language.
    :param batch_size: Number of samples the model receives in one batch for inference.
                       Memory consumption is much lower in inference mode. Recommendation: Increase the batch size
                       to a value so only a single batch is used.
    """
    # GPU is only used when the device is not CPU *and* the caller asked for it.
    self._use_gpu = paddle.get_device() != "cpu" and use_gpu
    self.model = model
    self._device_id = device_id
    self._num_threads = num_threads or math.ceil(cpu_count() / 2)
    self._topn = topn
    self._lang = lang
    self._batch_size = batch_size

    # Without a custom path, use the per-model directory under PPNLP_HOME.
    default_task_path = os.path.join(PPNLP_HOME, "pipelines", "document_intelligence", self.model)
    self._task_path = default_task_path if task_path is None else task_path

    # Fetch the model archive before the predictor is built from it.
    model_urls = URLS[self.model]
    download_file(self._task_path, "docprompt_params.tar", model_urls[0], model_urls[1])
    self._get_inference_model()

    tokenizer = AutoTokenizer.from_pretrained("ernie-layoutx-base-uncased")
    self._tokenizer = tokenizer
    self._reader = ImageReader(super_rel_pos=False, tokenizer=tokenizer)

pipelines.pipelines.nodes.document.document_preprocessor

DocOCRProcessor

Preprocess document input from image/image url/image bytestream to ocr outputs

Source code in pipelines/pipelines/nodes/document/document_preprocessor.py
class DocOCRProcessor(BaseComponent):
    """
    Preprocess document input from image/image url/image bytestream to ocr outputs

    The node validates the raw request, makes the document available as a
    local image file and attaches the OCR result (or the caller-supplied word
    boxes) to the example.
    """

    return_no_answers: bool
    outgoing_edges = 1
    query_count = 0
    query_time = 0

    def __init__(self, use_gpu: bool = True, lang: str = "ch"):
        """
        Init Document Preprocessor.
        :param use_gpu: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
        :param lang: Choose the language processed by the OCR model.
        """
        self._lang = lang
        # GPU is only used when the device is not CPU *and* the caller asked for it.
        self._use_gpu = paddle.get_device() != "cpu" and use_gpu
        self._ocr = PaddleOCR(use_angle_cls=True, show_log=False, use_gpu=self._use_gpu, lang=self._lang)

    def _check_input_text(self, inputs):
        """
        Validate the raw inputs and normalise them into a list of example dicts.

        :param inputs: A dict or a list of dicts. Each dict needs a ``doc`` (image url, local path or
                       base64-encoded image) and a ``prompt`` (string or list of strings); ``word_boxes``
                       is optional and copied through unchanged.
        :return: List of dicts holding ``doc`` (a local file path), ``prompt`` (list of strings) and,
                 if given, ``word_boxes``.
        :raises ValueError: If ``doc`` or ``prompt`` is missing, or ``doc`` is not a string.
        :raises TypeError: If the inputs or the prompt have an unsupported type.
        """
        type_error_template = "Invalid inputs, input for document intelligence task should be dict or list of dict, but type of {} found!"

        candidates = [inputs] if isinstance(inputs, dict) else inputs
        if not isinstance(candidates, list):
            raise TypeError(type_error_template.format(type(candidates)))

        checked = []
        for item in candidates:
            if not isinstance(item, dict):
                raise TypeError(type_error_template.format(type(item)))

            # --- document ---
            if "doc" not in item:
                raise ValueError("Invalid inputs, the inputs should contain an url to an image or a local path.")
            source = item["doc"]
            if not isinstance(source, str):
                raise ValueError("Incorrect path or url, URLs must start with `http://` or `https://`")

            record = {}
            if source.startswith(("http://", "https://")):
                # Remote image: download it into the working directory under its own file name.
                file_name = source.rsplit("/", 1)[-1]
                download_file("./", file_name, source)
                record["doc"] = file_name
            elif os.path.isfile(source):
                record["doc"] = source
            else:
                # Anything else is treated as a base64-encoded image and dumped to a temporary file.
                decoded = base64.b64decode(source.encode("utf-8"))
                buffer = np.frombuffer(bytearray(decoded), dtype="uint8")
                pixels = np.array(Image.open(BytesIO(buffer)).convert("RGB"))
                Image.fromarray(pixels).save("./tmp.jpg")
                record["doc"] = "./tmp.jpg"

            # --- prompt ---
            if "prompt" not in item:
                raise ValueError("Invalid inputs, the inputs should contain the prompt.")
            prompt = item["prompt"]
            if isinstance(prompt, str):
                record["prompt"] = [prompt]
            elif isinstance(prompt, list) and all(isinstance(entry, str) for entry in prompt):
                record["prompt"] = prompt
            else:
                raise TypeError("Incorrect prompt, prompt should be string or list of string.")

            # --- optional precomputed OCR ---
            if "word_boxes" in item:
                record["word_boxes"] = item["word_boxes"]
            checked.append(record)
        return checked

    def run(self, meta: dict):
        """
        Validate the request and attach the OCR result to the first example.

        :param meta: Raw request, see ``_check_input_text`` for the accepted layout.
        :return: Tuple of ``{"example": example}`` and the outgoing edge name ``"output_1"``; the example
                 carries the extra keys ``ocr_type`` and ``ocr_result``.
        """
        example = self._check_input_text(meta)[0]

        if "word_boxes" in example:
            # The caller supplied the boxes, so the OCR engine is skipped.
            ocr_result = example["word_boxes"]
            example["ocr_type"] = "word_boxes"
        else:
            ocr_result = self._ocr.ocr(example["doc"], cls=True)
            example["ocr_type"] = "ppocr"
            # Compatible with paddleocr>=2.6.0.2
            if len(ocr_result) == 1:
                ocr_result = ocr_result[0]
        example["ocr_result"] = ocr_result
        return {"example": example}, "output_1"

__init__

__init__(use_gpu: bool = True, lang: str = 'ch')

Init Document Preprocessor.

Parameters:

Name Type Description Default
use_gpu bool

Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.

True
lang str

Choose the language processed by the OCR model.

'ch'
Source code in pipelines/pipelines/nodes/document/document_preprocessor.py
def __init__(self, use_gpu: bool = True, lang: str = "ch"):
    """
    Init Document Preprocessor.
    :param use_gpu: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available.
    :param lang: Choose the language processed by the OCR model.
    """
    # GPU is only used when the device is not CPU *and* the caller asked for it.
    gpu_enabled = paddle.get_device() != "cpu" and use_gpu
    self._lang = lang
    self._use_gpu = gpu_enabled
    self._ocr = PaddleOCR(use_angle_cls=True, show_log=False, use_gpu=gpu_enabled, lang=lang)