Question Generator Module

pipelines.pipelines.nodes.question_generator.question_generator

QuestionGenerator

Question Generator based on Unimo Text.
Source code in pipelines/pipelines/nodes/question_generator/question_generator.py
class QuestionGenerator(BaseComponent):
    """
    Question Generator based on Unimo Text.

    Wraps a ``Taskflow("question_generation", ...)`` object (built in
    ``__init__``) and uses it to turn (context, answer candidate) pairs into
    synthetic questions.
    """

    # File id -> file name expected inside the local model directory.
    # Iterated by _check_task_files to verify/download every required file.
    resource_files_names = {
        "model_state": "model_state.pdparams",
        "model_config": "model_config.json",
        "vocab_file": "vocab.txt",
        "special_tokens_map": "special_tokens_map.json",
        "tokenizer_config": "tokenizer_config.json",
    }

    # Model name -> file id -> [download URL, expected MD5 checksum].
    # NOTE(review): the only model key here ends in "-question-generator", while the
    # default ``model`` argument of __init__ ends in "-question-generation"; the
    # default is handled as a built-in Taskflow model and never looked up in this table.
    resource_files_urls = {
        "unimo-text-1.0-question-generator": {
            "model_state": [
                "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/model_state.pdparams",
                "856a2980f83dc227a8fed4ecd730696d",
            ],
            "model_config": [
                "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/model_config.json",
                "b5bab534683d9f0ef82fc84803ee6f3d",
            ],
            "vocab_file": [
                "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/vocab.txt",
                "ea3f8a8cc03937a8df165d2b507c551e",
            ],
            "special_tokens_map": [
                "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/special_tokens_map.json",
                "8b3fb1023167bb4ab9d70708eb05f6ec",
            ],
            "tokenizer_config": [
                "https://bj.bcebos.com/paddlenlp/pipelines/question_generator/unimo-text-1.0-question-generator-v1/tokenizer_config.json",
                "ef261f5d413a46ed1d6f071aed6fb345",
            ],
        },
    }

    # Annotation only; no value is assigned in the methods shown here.
    # Presumably part of the BaseComponent contract -- TODO confirm.
    return_no_answers: bool
    # run() returns exactly one edge name ("output_1").
    outgoing_edges = 1
    # Not updated by any method shown here; presumably bookkeeping counters
    # read by the surrounding pipeline framework -- TODO confirm.
    query_count = 0
    query_time = 0

    def __init__(
        self,
        model="unimo-text-1.0-question-generation",
        task_path=None,
        device="gpu",
        batch_size=16,
        output_scores=True,
        is_select_from_num_return_sequences=False,
        max_length=50,
        decode_strategy="sampling",
        temperature=1.0,
        top_k=5,
        top_p=1.0,
        num_beams=6,
        num_beam_groups=1,
        diversity_rate=0.0,
        num_return_sequences=1,
        template=1,
    ):
        """
        Build the ``question_generation`` Taskflow used by this node.

        The model is resolved in one of three ways:

        * ``task_path`` given: the Taskflow loads a custom model from that directory.
        * ``model`` is one of the built-in Taskflow model names: the name is handed
          to the Taskflow as is and no local directory is used.
        * otherwise: the files listed in ``resource_files_urls[model]`` are
          verified/downloaded under ``PPNLP_HOME`` and loaded from there
          (a ``KeyError`` is raised if ``model`` has no entry in that table).

        Args:
            model: Model name, see above.
            task_path: Optional directory holding a custom model.
            device: Passed to ``paddle.set_device``; ``"gpu"`` selects Taskflow
                ``device_id=0``, any other value selects ``device_id=-1``.
            batch_size: Stored on the instance (used by ``run``) and forwarded to the Taskflow.
            output_scores: Accepted for interface compatibility only. Scores are always
                requested from the Taskflow because ``create_question`` reads the
                result as a (questions, scores) pair.
            num_return_sequences: Stored on the instance (used by ``run``) and
                forwarded to the Taskflow.
            template: Forwarded to the Taskflow.
            is_select_from_num_return_sequences, max_length, decode_strategy,
            temperature, top_k, top_p, num_beams, num_beam_groups, diversity_rate:
                Forwarded to the Taskflow unchanged.
        """
        paddle.set_device(device)
        self.model = model
        self._from_taskflow = False
        self._custom_model = False
        if task_path:
            self._task_path = task_path
            self._custom_model = True
        elif model in (
            "unimo-text-1.0",
            "unimo-text-1.0-dureader_qg",
            "unimo-text-1.0-question-generation",
            "unimo-text-1.0-question-generation-dureader_qg",
        ):
            self._task_path = None
            self._from_taskflow = True
        else:
            self._task_path = os.path.join(PPNLP_HOME, "pipelines", "unsupervised_question_answering", self.model)
            # Must run before self.model is overwritten: the download table is keyed by the requested name.
            self._check_task_files()
            self.model = "unimo-text-1.0"
        self.num_return_sequences = num_return_sequences
        self.batch_size = batch_size

        taskflow_kwargs = {
            "model": self.model if self._from_taskflow else "unimo-text-1.0",
            "output_scores": True,
            "max_length": max_length,
            "is_select_from_num_return_sequences": is_select_from_num_return_sequences,
            "num_return_sequences": num_return_sequences,
            "batch_size": batch_size,
            "decode_strategy": decode_strategy,
            "num_beams": num_beams,
            "num_beam_groups": num_beam_groups,
            "diversity_rate": diversity_rate,
            "top_k": top_k,
            "top_p": top_p,
            "temperature": temperature,
            "template": template,
            "device_id": 0 if device == "gpu" else -1,
        }
        # Only pass task_path when a local model directory is used, so built-in
        # models are requested exactly as before (without that keyword at all).
        if not self._from_taskflow:
            taskflow_kwargs["task_path"] = self._task_path
        self.question_generation = Taskflow("question_generation", **taskflow_kwargs)

    def _check_task_files(self):
        """
        Check files required by the task.
        """
        for file_id, file_name in self.resource_files_names.items():
            path = os.path.join(self._task_path, file_name)
            url = self.resource_files_urls[self.model][file_id][0]
            md5 = self.resource_files_urls[self.model][file_id][1]

            downloaded = True
            if not os.path.exists(path):
                downloaded = False
            else:
                if not self._custom_model:
                    if os.path.exists(path):
                        # Check whether the file is updated
                        if not md5file(path) == md5:
                            downloaded = False
                            if file_id == "model_state":
                                self._param_updated = True
                    else:
                        downloaded = False
            if not downloaded:
                download_file(self._task_path, file_name, url, md5)

    def create_question(
        self, json_file_or_pair_list, out_json=None, num_return_sequences=1, all_sample_num=None, batch_size=8
    ):
        """
        Generate synthetic questions for (context, answer candidate) pairs.

        Args:
            json_file_or_pair_list: Either a list of record dicts, or the path of a
                UTF-8 file holding one JSON record per line. Every record needs a
                "context" and an "answer_candidates" entry (a sequence of
                ``(answer, probability)`` pairs) and may carry a "question".
            out_json: Optional path; when given, every output dict is also written
                to it as one JSON line.
            num_return_sequences: How many generated questions are expected per
                sample. Results are matched to samples positionally, so this
                presumably has to equal the value the Taskflow was built with
                -- TODO confirm.
            all_sample_num: Optional cap on the number of (context, answer) samples
                that are processed; falsy means no cap.
            batch_size: Number of samples sent to the generator per call.

        Returns:
            A list of dicts with the keys "context", "synthetic_answer",
            "synthetic_answer_probability", "synthetic_question",
            "synthetic_question_probability" and "true_question" ("" when the
            record had no "question").
        """
        if isinstance(json_file_or_pair_list, list):
            all_lines = json_file_or_pair_list
        else:
            with open(json_file_or_pair_list, "r", encoding="utf-8") as rf:
                all_lines = [json.loads(json_line) for json_line in rf]

        output = []
        wf = open(out_json, "w", encoding="utf-8") if out_json else None

        def emit(samples):
            # Run the generator on one batch and record one dict per returned question.
            generated = self.question_generation(
                [{"context": context, "answer": answer} for context, answer, _, _ in samples]
            )
            # The generator returns num_return_sequences results per sample, in
            # sample order, so repeat each sample to line it up with its results.
            repeated = [sample for sample in samples for _ in range(num_return_sequences)]
            for (context, answer, answer_probability, true_question), question, score in zip(
                repeated, generated[0], generated[1]
            ):
                out_dict = {
                    "context": context,
                    "synthetic_answer": answer,
                    "synthetic_answer_probability": answer_probability,
                    "synthetic_question": question,
                    "synthetic_question_probability": score,
                    "true_question": true_question,
                }
                if wf is not None:
                    wf.write(json.dumps(out_dict, ensure_ascii=False) + "\n")
                output.append(out_dict)

        try:
            # Each pending entry keeps all four fields together, so a batch can
            # never be paired with probabilities left over from an earlier batch.
            pending = []
            sample_count = 0
            limit_reached = False
            for line_dict in tqdm(all_lines):
                true_question = line_dict.get("question", "")
                context = line_dict["context"]
                assert "answer_candidates" in line_dict
                answers = line_dict["answer_candidates"]
                if not answers:
                    continue
                for answer, answer_probability in answers:
                    pending.append((context, answer, answer_probability, true_question))
                    sample_count += 1
                    limit_reached = bool(all_sample_num) and sample_count >= all_sample_num
                    if sample_count % batch_size == 0 or limit_reached:
                        emit(pending)
                        pending = []
                    if limit_reached:
                        break
                if limit_reached:
                    # Stop reading further records once the sample cap is hit.
                    break
            # Flush the trailing partial batch, even when the last record(s) had no candidates.
            if pending:
                emit(pending)
        finally:
            if wf is not None:
                wf.close()
        return output

    def run(self, ca_pairs):
        print("createing synthetic question-answer pairs...")
        synthetic_context_answer_question_triples = self.create_question(
            ca_pairs, None, self.num_return_sequences, None, self.batch_size
        )
        print("create synthetic question-answer pairs successfully!")
        results = {"cqa_triples": synthetic_context_answer_question_triples}
        return results, "output_1"