跳转至

File Converter Module

pipelines.pipelines.nodes.file_converter.docx

DocxToTextConverter

Source code in pipelines/pipelines/nodes/file_converter/docx.py
class DocxToTextConverter(BaseConverter):
    """
    Convert a .docx file into text documents and export the pictures embedded in it.

    Consecutive non-empty paragraphs are concatenated into one document; an empty paragraph without
    pictures closes the current document. Pictures are written to ``self.desc_path`` and their file
    names are listed under ``meta["images"]`` of the document they were collected with.
    """

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
    ):
        """
        :param remove_numeric_tables: Stored as the default for `convert`. Removing numeric table rows is not
                                      supported by this converter: `convert` raises if the effective value
                                      is True.
        :param valid_languages: Stored as the default for `convert`. Language validation is not performed by
                                this converter; `convert` raises if the effective value is the literal True.
        """

        # Save init parameters to enable export of component config as YAML
        self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

        self.remove_numeric_tables = remove_numeric_tables
        self.valid_languages = valid_languages

        # Relative directory that `save_images` writes extracted pictures into.
        self.desc_path = "parse_files"
        os.makedirs(self.desc_path, exist_ok=True)

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, Any]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Extract text and pictures from a .docx file.

        Note: As docx doesn't contain "page" information, the file is split on empty paragraphs instead:
        every run of non-empty paragraphs becomes one document. For compliance with other converters the
        method name is kept.

        :param file_path: Path to the .docx file you want to convert
        :param meta: Optional meta data. Only its "name" entry (if present) is copied into each returned
                     document.
        :param remove_numeric_tables: Not supported; falls back to the value given at construction when
                                      None, and an effective value of True raises.
        :param valid_languages: Not supported; falls back to the value given at construction when None,
                                and an effective value of literal True raises.
        :param encoding: Not applicable
        :return: List of dicts with the keys "content", "content_type" and "meta"; "meta" holds "images"
                 (file names of the saved pictures) and, when available, "name".
        :raises Exception: if `remove_numeric_tables` or `valid_languages` resolves to True.
        """
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages
        if remove_numeric_tables is True:
            raise Exception("'remove_numeric_tables' is not supported by DocxToTextConverter.")
        # NOTE(review): only the literal True is rejected here; a list of languages is silently ignored.
        if valid_languages is True:
            raise Exception("Language validation using 'valid_languages' is not supported by DocxToTextConverter.")

        # Creating word reader object.
        file = docx.Document(file_path)
        documents: List[Dict[str, Any]] = []
        pending_text: List[str] = []
        pending_images: List[str] = []

        def flush() -> None:
            """Emit the accumulated text as one document; pending pictures are kept if there is no text yet."""
            raw_text = "".join(pending_text)
            if raw_text == "":
                return
            meta_data: Dict[str, Any] = {}
            if meta is not None and "name" in meta:
                meta_data["name"] = meta["name"]
            meta_data["images"] = list(pending_images)
            documents.append({"content": raw_text, "content_type": "text", "meta": meta_data})
            pending_text.clear()
            pending_images.clear()

        # Read the paragraph list once instead of re-reading the property on every iteration.
        for paragraph in file.paragraphs:
            image_list = self.get_image_list(file, paragraph)
            text = paragraph.text
            if text != "":
                pending_text.append(text)
            elif image_list is None:
                # Empty paragraph without pictures: this is the separator between documents.
                flush()
                continue
            if image_list is not None:
                # Works even before any text was seen (previously a KeyError on an empty accumulator).
                pending_images.extend(self.save_images(image_list))

        # Without this the text after the last empty paragraph would be dropped.
        flush()
        return documents

    def save_images(self, image_list):
        """
        Save the parsed images into `self.desc_path`.

        :param image_list: image objects from the docx file; `ext` and `blob` are read from each
        :return: file names (without directory) of the images that were written
        """
        image_names = []
        for i, image in enumerate(image_list):
            if not image:
                continue
            # File extension & file content
            ext, blob = image.ext, image.blob
            # The md5 of the content plus the position in the list forms the file name.
            md5_name = hashlib.md5(blob).hexdigest()
            image_name = "{}_{}.{}".format(md5_name, i, ext)
            image_path = os.path.join(self.desc_path, image_name)
            Image.open(BytesIO(blob)).save(image_path)
            image_names.append(image_name)

        return image_names

    def get_image_list(self, document: Document, paragraph: Paragraph):
        """
        Collect the image objects referenced by the pictures of a paragraph.

        :param document: the opened docx document the paragraph belongs to
        :param paragraph: paragraph to search for pictures
        :return: list of image objects, or None when the paragraph contains no picture
        """
        # Looking up the pictures of the paragraph
        img_list = paragraph._element.xpath(".//pic:pic")
        if not img_list:
            return None
        result_list = []
        for img in img_list:
            # The relationship id of the picture resolves to the image part stored in the document.
            embed = img.xpath(".//a:blip/@r:embed")[0]
            related_part = document.part.related_parts[embed]
            result_list.append(related_part.image)
        return result_list

__init__

__init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None)

Parameters:

Name Type Description Default
remove_numeric_tables bool

This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it does not have table parsing capability for finding answers. However, tables may also have long strings that could be possible candidates for searching answers. The rows containing strings are thus retained in this option.

False
valid_languages Optional[List[str]]

validate languages from a list of languages specified in the ISO 639-1 (https://en.wikipedia.org/wiki/ISO_639-1) format. This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text.

None
Source code in pipelines/pipelines/nodes/file_converter/docx.py
def __init__(
    self,
    remove_numeric_tables: bool = False,
    valid_languages: Optional[List[str]] = None,
):
    """
    Store the converter options and make sure the image output directory exists.

    :param remove_numeric_tables: Kept on the instance as `self.remove_numeric_tables`.
    :param valid_languages: Kept on the instance as `self.valid_languages`; languages are expected in the
                            ISO 639-1 (https://en.wikipedia.org/wiki/ISO_639-1) format.
    """
    # Record the constructor arguments first so the component config can be exported as YAML.
    init_arguments = {"remove_numeric_tables": remove_numeric_tables, "valid_languages": valid_languages}
    self.set_config(**init_arguments)

    self.remove_numeric_tables = remove_numeric_tables
    self.valid_languages = valid_languages

    # Relative directory for extracted images; creating it is a no-op when it already exists.
    self.desc_path = "parse_files"
    os.makedirs(self.desc_path, exist_ok=True)

convert

convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None) -> List[Dict[str, Any]]

Extract text from a .docx file. Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here. For compliance with other converters we nevertheless opted for keeping the method's name.

Parameters:

Name Type Description Default
file_path Path

Path to the .docx file you want to convert

required
meta Optional[Dict[str, Any]]

dictionary of meta data key-value pairs to append in the returned document.

None
remove_numeric_tables Optional[bool]

This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it does not have table parsing capability for finding answers. However, tables may also have long strings that could possible candidate for searching answers. The rows containing strings are thus retained in this option.

None
valid_languages Optional[List[str]]

validate languages from a list of languages specified in the ISO 639-1 (https://en.wikipedia.org/wiki/ISO_639-1) format. This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text.

None
encoding Optional[str]

Not applicable

None
Source code in pipelines/pipelines/nodes/file_converter/docx.py
def convert(
    self,
    file_path: Path,
    meta: Optional[Dict[str, Any]] = None,
    remove_numeric_tables: Optional[bool] = None,
    valid_languages: Optional[List[str]] = None,
    encoding: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Extract text and pictures from a .docx file.

    Note: As docx doesn't contain "page" information, the file is split on empty paragraphs instead:
    every run of non-empty paragraphs becomes one document. For compliance with other converters the
    method name is kept.

    :param file_path: Path to the .docx file you want to convert
    :param meta: Optional meta data. Only its "name" entry (if present) is copied into each returned
                 document.
    :param remove_numeric_tables: Not supported; falls back to the value given at construction when
                                  None, and an effective value of True raises.
    :param valid_languages: Not supported; falls back to the value given at construction when None,
                            and an effective value of literal True raises.
    :param encoding: Not applicable
    :return: List of dicts with the keys "content", "content_type" and "meta"; "meta" holds "images"
             (file names of the saved pictures) and, when available, "name".
    :raises Exception: if `remove_numeric_tables` or `valid_languages` resolves to True.
    """
    if remove_numeric_tables is None:
        remove_numeric_tables = self.remove_numeric_tables
    if valid_languages is None:
        valid_languages = self.valid_languages
    if remove_numeric_tables is True:
        raise Exception("'remove_numeric_tables' is not supported by DocxToTextConverter.")
    # NOTE(review): only the literal True is rejected here; a list of languages is silently ignored.
    if valid_languages is True:
        raise Exception("Language validation using 'valid_languages' is not supported by DocxToTextConverter.")

    # Creating word reader object.
    file = docx.Document(file_path)
    documents: List[Dict[str, Any]] = []
    pending_text: List[str] = []
    pending_images: List[str] = []

    def flush() -> None:
        """Emit the accumulated text as one document; pending pictures are kept if there is no text yet."""
        raw_text = "".join(pending_text)
        if raw_text == "":
            return
        meta_data: Dict[str, Any] = {}
        if meta is not None and "name" in meta:
            meta_data["name"] = meta["name"]
        meta_data["images"] = list(pending_images)
        documents.append({"content": raw_text, "content_type": "text", "meta": meta_data})
        pending_text.clear()
        pending_images.clear()

    # Read the paragraph list once instead of re-reading the property on every iteration.
    for paragraph in file.paragraphs:
        image_list = self.get_image_list(file, paragraph)
        text = paragraph.text
        if text != "":
            pending_text.append(text)
        elif image_list is None:
            # Empty paragraph without pictures: this is the separator between documents.
            flush()
            continue
        if image_list is not None:
            # Works even before any text was seen (previously a KeyError on an empty accumulator).
            pending_images.extend(self.save_images(image_list))

    # Without this the text after the last empty paragraph would be dropped.
    flush()
    return documents

get_image_list

get_image_list(document: Document, paragraph: Paragraph)

Extract images from paragraph and document object.

Parameters:

Name Type Description Default
document Document

file objects

required
paragraph Paragraph

image paragraph

required
Source code in pipelines/pipelines/nodes/file_converter/docx.py
def get_image_list(self, document: Document, paragraph: Paragraph):
    """
    Collect the image objects referenced by the pictures of a paragraph.

    :param document: the opened docx document the paragraph belongs to
    :param paragraph: paragraph to search for pictures
    :return: list of image objects, or None when the paragraph contains no picture
    """
    # All picture elements below the paragraph's XML element.
    pictures = paragraph._element.xpath(".//pic:pic")
    if not pictures:
        return None

    images = []
    for picture in pictures:
        # The relationship id of the picture resolves to the image part stored in the document.
        relationship_id = picture.xpath(".//a:blip/@r:embed")[0]
        images.append(document.part.related_parts[relationship_id].image)
    return images

save_images

save_images(image_list)

Save the parsed image into desc_path

Parameters:

Name Type Description Default
image_list

image files from the docx file

required
Source code in pipelines/pipelines/nodes/file_converter/docx.py
def save_images(self, image_list):
    """
    Write every truthy image of `image_list` into `self.desc_path`.

    :param image_list: image objects from the docx file; `ext` and `blob` are read from each
    :return: file names (without directory) of the images that were written, in input order
    """
    saved_names = []
    for position, picture in enumerate(image_list):
        if not picture:
            # Falsy entries are skipped, but they still consume a position.
            continue
        extension = picture.ext
        content = picture.blob
        # The md5 of the content plus the position in the list forms the file name.
        digest = hashlib.md5(content).hexdigest()
        file_name = "{}_{}.{}".format(digest, position, extension)
        Image.open(BytesIO(content)).save(os.path.join(self.desc_path, file_name))
        saved_names.append(file_name)
    return saved_names

DocxTotxtConverter

Source code in pipelines/pipelines/nodes/file_converter/docx.py
class DocxTotxtConverter(BaseConverter):
    """Convert a .docx file into a single text document made of all its paragraphs."""

    def convert(
        self,
        file_path: Path,
        separator="\n",
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """
        Extract text from a .docx file.

        :param file_path: Path to the .docx file you want to convert
        :param separator: String placed between the texts of consecutive paragraphs
        :param kwargs: Accepted for interface compatibility; not used by this implementation
        :return: List holding one dict with the keys "content", "content_type" and "meta" (an empty dict)
        """
        # Creating word reader object.
        file = docx.Document(file_path)
        txt_documents = separator.join(paragraph.text for paragraph in file.paragraphs)
        document = {"content": txt_documents, "content_type": "text", "meta": {}}
        return [document]

convert

convert(file_path: Path, separator='\n', **kwargs: Any) -> List[str]

Extract text from a .docx file.

Source code in pipelines/pipelines/nodes/file_converter/docx.py
def convert(
    self,
    file_path: Path,
    separator="\n",
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """
    Extract text from a .docx file.

    :param file_path: Path to the .docx file you want to convert
    :param separator: String placed between the texts of consecutive paragraphs
    :param kwargs: Accepted for interface compatibility; not used by this implementation
    :return: List holding one dict with the keys "content", "content_type" and "meta" (an empty dict)
    """
    # Creating word reader object.
    file = docx.Document(file_path)
    txt_documents = separator.join(paragraph.text for paragraph in file.paragraphs)
    document = {"content": txt_documents, "content_type": "text", "meta": {}}
    return [document]

pipelines.pipelines.nodes.file_converter.image

ImageToTextConverter

Source code in pipelines/pipelines/nodes/file_converter/image.py
class ImageToTextConverter(BaseConverter):
    """Convert an image file into a text document by running OCR (PaddleOCR) on it."""

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = ["eng"],
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
        :param valid_languages: Languages the extracted text is validated against in `convert`. If the text is
                                not in one of these languages, a warning is logged because the text might be
                                garbled.
                                NOTE(review): the OCR engine itself is created with lang="ch" regardless of
                                this value; confirm which language codes `validate_language` expects.
        """

        # save init parameters to enable export of component config as YAML
        self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
        # Run OCR on the GPU only when paddle reports a GPU device.
        use_gpu = "gpu" in paddle.device.get_device()
        self.recognize = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=use_gpu)

        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """
        Extract text from an image file with the OCR engine created in `__init__` (PaddleOCR).

        :param file_path: path to image file
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                     Can be any custom keys and values.
        :param remove_numeric_tables: When truthy, lines in which more than 40% of the words contain a digit
                                      and which do not end with a period are removed from the text.
                                      Falls back to the value given at construction when None.
        :param valid_languages: Languages to validate the extracted text against; a warning is logged when
                                validation fails. Falls back to the value given at construction when None.
        :param encoding: Not used by this implementation
        :param kwargs: Not used by this implementation
        :return: One dict per page with the keys "content" and "meta"
        """
        pages = self._image_to_text(file_path)
        if remove_numeric_tables is None:
            remove_numeric_tables = self.remove_numeric_tables
        if valid_languages is None:
            valid_languages = self.valid_languages

        cleaned_pages = []
        for page in pages:
            if remove_numeric_tables:
                kept_lines = []
                for line in page.splitlines():
                    words = line.split()
                    digits = [word for word in words if any(char.isdigit() for char in word)]

                    # remove lines having > 40% of words as digits AND not ending with a period(.)
                    if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                        logger.debug(f"Removing line '{line}' from file")
                        continue
                    kept_lines.append(line)
                # Use the filtered lines; previously the unfiltered page was appended and the filter was a no-op.
                page = "\n".join(kept_lines)
            cleaned_pages.append(page)

        if valid_languages:
            document_text = "".join(cleaned_pages)
            if not self.validate_language(document_text, valid_languages):
                logger.warning(
                    f"The language for image is not one of {valid_languages}. The file may not have "
                    f"been decoded in the correct text format."
                )
        documents = []
        for page in cleaned_pages:
            document = {"content": page, "meta": meta}
            documents.append(document)
        return documents

    def _image_to_text(self, img_path) -> List[str]:
        """
        Run OCR on an image and return the recognized text.

        :param img_path: input image path
        :return: list with a single string: all recognized text fragments concatenated without separator
        """
        img_path = str(img_path)
        result = self.recognize.ocr(img_path, cls=True)
        # Guard against an empty result or a None first entry (presumably returned when no text is
        # detected) instead of failing while iterating.
        lines = result[0] if result else None
        if not lines:
            return [""]
        # The last element of each entry is read as (text, ...); only the text is kept.
        return ["".join(line[-1][0] for line in lines)]

__init__

__init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ['eng'])

Parameters:

Name Type Description Default
remove_numeric_tables bool

This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it does not have table parsing capability for finding answers. However, tables may also have long strings that could possible candidate for searching answers. The rows containing strings are thus retained in this option.

False
valid_languages Optional[List[str]]

validate languages from a list of languages specified here (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html) This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text. Run the following line of code to check available language packs: # List of available languages print(pytesseract.get_languages(config=''))

['eng']
Source code in pipelines/pipelines/nodes/file_converter/image.py
def __init__(
    self,
    remove_numeric_tables: bool = False,
    valid_languages: Optional[List[str]] = ["eng"],
):
    """
    Create the OCR engine and hand the converter options to the base class.

    :param remove_numeric_tables: Forwarded to the base class; see `convert` for the heuristic it enables.
    :param valid_languages: Forwarded to the base class; languages the extracted text is validated against.
                            NOTE(review): the OCR engine itself is created with lang="ch" regardless of
                            this value.
    """
    # Record the constructor arguments so the component config can be exported as YAML.
    self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

    # Run OCR on the GPU only when paddle reports a GPU device.
    device_name = paddle.device.get_device()
    use_gpu = "gpu" in device_name
    self.recognize = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=use_gpu)

    super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

convert

convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = 'utf-8', **kwargs: Any) -> List[Dict[str, Any]]

Extract text from an image file using PaddleOCR (the OCR engine created in the constructor).

Parameters:

Name Type Description Default
file_path Path

path to image file

required
meta Optional[Dict[str, str]]

Optional dictionary with metadata that shall be attached to all resulting documents. Can be any custom keys and values.

None
remove_numeric_tables Optional[bool]

This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it does not have table parsing capability for finding answers. However, tables may also have long strings that could possible candidate for searching answers. The rows containing strings are thus retained in this option.

None
valid_languages Optional[List[str]]

validate languages from a list of languages supported by Tesseract (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text.

None
Source code in pipelines/pipelines/nodes/file_converter/image.py
def convert(
    self,
    file_path: Path,
    meta: Optional[Dict[str, str]] = None,
    remove_numeric_tables: Optional[bool] = None,
    valid_languages: Optional[List[str]] = None,
    encoding: Optional[str] = "utf-8",
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """
    Extract text from an image file via `self._image_to_text`.

    :param file_path: path to image file
    :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                 Can be any custom keys and values.
    :param remove_numeric_tables: When truthy, lines in which more than 40% of the words contain a digit
                                  and which do not end with a period are removed from the text.
                                  Falls back to the value given at construction when None.
    :param valid_languages: Languages to validate the extracted text against; a warning is logged when
                            validation fails. Falls back to the value given at construction when None.
    :param encoding: Not used by this implementation
    :param kwargs: Not used by this implementation
    :return: One dict per page with the keys "content" and "meta"
    """
    pages = self._image_to_text(file_path)
    if remove_numeric_tables is None:
        remove_numeric_tables = self.remove_numeric_tables
    if valid_languages is None:
        valid_languages = self.valid_languages

    cleaned_pages = []
    for page in pages:
        if remove_numeric_tables:
            kept_lines = []
            for line in page.splitlines():
                words = line.split()
                digits = [word for word in words if any(char.isdigit() for char in word)]

                # remove lines having > 40% of words as digits AND not ending with a period(.)
                if words and len(digits) / len(words) > 0.4 and not line.strip().endswith("."):
                    logger.debug(f"Removing line '{line}' from file")
                    continue
                kept_lines.append(line)
            # Use the filtered lines; previously the unfiltered page was appended and the filter was a no-op.
            page = "\n".join(kept_lines)
        cleaned_pages.append(page)

    if valid_languages:
        document_text = "".join(cleaned_pages)
        if not self.validate_language(document_text, valid_languages):
            logger.warning(
                f"The language for image is not one of {valid_languages}. The file may not have "
                f"been decoded in the correct text format."
            )
    documents = []
    for page in cleaned_pages:
        document = {"content": page, "meta": meta}
        documents.append(document)
    return documents

pipelines.pipelines.nodes.file_converter.markdown

MarkdownConverter

Source code in pipelines/pipelines/nodes/file_converter/markdown.py
class MarkdownConverter(BaseConverter):
    """Converter that turns a Markdown file into a single plain-text document."""

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
    ) -> List[Dict[str, Any]]:
        """
        Reads a Markdown file and converts its content to plain text.

        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param encoding: Select the file encoding (default is `utf-8`)
        :param remove_numeric_tables: Not applicable
        :param valid_languages: Not applicable

        :return: List containing one dict of format
                 {"content": "The text from file", "content_type": "text", "meta": meta}
        """
        # errors="ignore" drops undecodable bytes instead of raising a decode error
        with open(file_path, encoding=encoding, errors="ignore") as f:
            markdown_text = f.read()
        text = self.markdown_to_text(markdown_text)
        document = {"content": text, "content_type": "text", "meta": meta}
        return [document]

    # Following code snippet is copied from https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
    @staticmethod
    def markdown_to_text(markdown_string: str) -> str:
        """
        Converts a markdown string to plaintext. Code snippets (`<pre>` blocks and inline
        `<code>` spans of the rendered HTML) are replaced by a single space.

        :param markdown_string: String in markdown format
        :return: The text content of the rendered markdown
        """
        # md -> html -> text since BeautifulSoup can extract text cleanly
        html = markdown(markdown_string)

        # remove code snippets; DOTALL lets `.` span the newlines inside multi-line blocks
        html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL)
        # the closing tag must be spelled `</code>` (a stray space would never match)
        html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL)

        # extract text
        soup = BeautifulSoup(html, "html.parser")
        text = "".join(soup.findAll(text=True))
        return text

convert

convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = 'utf-8') -> List[Dict[str, Any]]

Reads a Markdown file and converts its content to plain text.

Parameters:

Name Type Description Default
file_path Path

path of the file to convert

required
meta Optional[Dict[str, str]]

dictionary of meta data key-value pairs to append in the returned document.

None
encoding Optional[str]

Select the file encoding (default is utf-8)

'utf-8'
remove_numeric_tables Optional[bool]

Not applicable

None
valid_languages Optional[List[str]]

Not applicable

None

Returns:

Type Description
List[Dict[str, Any]]

List containing one dict of format {"content": "The text from file", "content_type": "text", "meta": meta}

Source code in pipelines/pipelines/nodes/file_converter/markdown.py
def convert(
    self,
    file_path: Path,
    meta: Optional[Dict[str, str]] = None,
    remove_numeric_tables: Optional[bool] = None,
    valid_languages: Optional[List[str]] = None,
    encoding: Optional[str] = "utf-8",
) -> List[Dict[str, Any]]:
    """
    Load a Markdown file and return its plain-text rendering as a single document.

    :param file_path: path of the file to convert
    :param meta: dictionary of meta data key-value pairs to append in the returned document.
    :param encoding: Select the file encoding (default is `utf-8`)
    :param remove_numeric_tables: Not applicable
    :param valid_languages: Not applicable

    :return: List containing one dict of format
             {"content": "The text from file", "content_type": "text", "meta": meta}
    """
    # Undecodable bytes are skipped (errors="ignore") rather than raising.
    with open(file_path, encoding=encoding, errors="ignore") as source:
        raw_markdown = source.read()

    return [
        {
            "content": self.markdown_to_text(raw_markdown),
            "content_type": "text",
            "meta": meta,
        }
    ]

markdown_to_text staticmethod

markdown_to_text(markdown_string: str) -> str

Converts a markdown string to plaintext

Parameters:

Name Type Description Default
markdown_string str

String in markdown format

required
Source code in pipelines/pipelines/nodes/file_converter/markdown.py
@staticmethod
def markdown_to_text(markdown_string: str) -> str:
    """
    Converts a markdown string to plaintext. Code snippets (`<pre>` blocks and inline
    `<code>` spans of the rendered HTML) are replaced by a single space.

    :param markdown_string: String in markdown format
    :return: The text content of the rendered markdown
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # remove code snippets; DOTALL lets `.` span the newlines inside multi-line blocks
    html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL)
    # the closing tag must be spelled `</code>` (a stray space would never match)
    html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = "".join(soup.findAll(text=True))
    return text

MarkdownRawTextConverter

Source code in pipelines/pipelines/nodes/file_converter/markdown.py
class MarkdownRawTextConverter(BaseConverter):
    """Converter that turns a Markdown file into text while keeping `#` heading markers."""

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """
        Reads a Markdown file and converts it to plain text, re-inserting the `#` markers
        in front of every heading so the heading level stays visible in the output.

        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param encoding: Select the file encoding (default is `utf-8`)
        :param remove_numeric_tables: Not applicable
        :param valid_languages: Not applicable

        :return: List containing one dict of format
                 {"content": "The text from file", "content_type": "text", "meta": meta}
        """
        # errors="ignore" drops undecodable bytes instead of raising a decode error
        with open(file_path, encoding=encoding, errors="ignore") as f:
            markdown_text = f.read()
        html = markdown(markdown_text)
        # remove code snippets; DOTALL lets `.` span the newlines inside multi-line blocks
        html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL)
        # the closing tag must be spelled `</code>` (a stray space would never match)
        html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL)
        # keep the headings: prefix the text of <hN> with N '#' characters
        for level in range(1, 7):
            tag = f"h{level}"
            html = re.sub(rf"<{tag}>(.*?)</{tag}>", rf"<{tag}>{'#' * level} \1</{tag}>", html)
        # extract text
        soup = BeautifulSoup(html, "html.parser")
        markdown_text = "".join(soup.findAll(text=True))
        document = {"content": markdown_text, "content_type": "text", "meta": meta}
        return [document]

convert

convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = 'utf-8', **kwargs: Any) -> List[Dict[str, Any]]

Reads a Markdown file and converts it to plain text, keeping the `#` markers in front of headings.

Parameters:

Name Type Description Default
file_path Path

path of the file to convert

required
meta Optional[Dict[str, str]]

dictionary of meta data key-value pairs to append in the returned document.

None
encoding Optional[str]

Select the file encoding (default is utf-8)

'utf-8'
remove_numeric_tables Optional[bool]

Not applicable

None
valid_languages Optional[List[str]]

Not applicable

None

Returns:

Type Description
List[Dict[str, Any]]

List containing one dict of format {"content": "The text from file", "content_type": "text", "meta": meta}

Source code in pipelines/pipelines/nodes/file_converter/markdown.py
def convert(
    self,
    file_path: Path,
    meta: Optional[Dict[str, str]] = None,
    remove_numeric_tables: Optional[bool] = None,
    valid_languages: Optional[List[str]] = None,
    encoding: Optional[str] = "utf-8",
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """
    Reads a Markdown file and converts it to plain text, re-inserting the `#` markers
    in front of every heading so the heading level stays visible in the output.

    :param file_path: path of the file to convert
    :param meta: dictionary of meta data key-value pairs to append in the returned document.
    :param encoding: Select the file encoding (default is `utf-8`)
    :param remove_numeric_tables: Not applicable
    :param valid_languages: Not applicable

    :return: List containing one dict of format
             {"content": "The text from file", "content_type": "text", "meta": meta}
    """
    # errors="ignore" drops undecodable bytes instead of raising a decode error
    with open(file_path, encoding=encoding, errors="ignore") as f:
        markdown_text = f.read()
    html = markdown(markdown_text)
    # remove code snippets; DOTALL lets `.` span the newlines inside multi-line blocks
    html = re.sub(r"<pre>(.*?)</pre>", " ", html, flags=re.DOTALL)
    # the closing tag must be spelled `</code>` (a stray space would never match)
    html = re.sub(r"<code>(.*?)</code>", " ", html, flags=re.DOTALL)
    # keep the headings: prefix the text of <hN> with N '#' characters
    for level in range(1, 7):
        tag = f"h{level}"
        html = re.sub(rf"<{tag}>(.*?)</{tag}>", rf"<{tag}>{'#' * level} \1</{tag}>", html)
    # extract text
    soup = BeautifulSoup(html, "html.parser")
    markdown_text = "".join(soup.findAll(text=True))
    document = {"content": markdown_text, "content_type": "text", "meta": meta}
    return [document]

pipelines.pipelines.nodes.file_converter.pdf

PDFToTextConverter

Source code in pipelines/pipelines/nodes/file_converter/pdf.py
class PDFToTextConverter(BaseConverter):
    """Converter that extracts the text of a PDF file, one document per page text returned."""

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        language: str = "en",
        valid_languages: Optional[List[str]] = None,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could possible candidate for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param language: Stored on the instance as `self.language`.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be encoding error resulting
                                in garbled text.
        """
        # save init parameters to enable export of component config as YAML
        self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
        self.language = language

    def convert(
        self,
        file_path: Path,
        process_num: int = 20,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        language: Optional[str] = "en",
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """
        Extract text from a .pdf file using the pypdf library (https://pypdf.readthedocs.io/)

        :param file_path: Path to the .pdf file you want to convert
        :param process_num: Number of processes
        :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                     Can be any custom keys and values.
        :param remove_numeric_tables: Accepted for interface compatibility; not used by this method.
        :param valid_languages: Accepted for interface compatibility; not used by this method.
        :param language: Accepted for interface compatibility; not used by this method.

        :return: List with one dict of format {"content": page_text, "content_type": "text", "meta": meta}
                 per page text returned by `_read_pdf`.
        """
        pages = self._read_pdf(file_path, process_num=process_num)
        documents = []
        for page in pages:
            document = {"content": page, "content_type": "text", "meta": meta}
            documents.append(document)
        return documents

    def _read_pdf(self, file_path: Path, process_num: int) -> List[str]:
        """
        Extract pages from the pdf file at file_path.

        The page range is cut into consecutive (start, end) chunks which are handed to
        `run_process` together with the process count.

        :param file_path: path of the pdf file
        :param process_num: Number of processes
        :return: The flattened list of texts produced for all chunks; empty for a PDF without pages.
        """
        # os.cpu_count() may return None when the count cannot be determined
        cpu_count = os.cpu_count() or 1
        if process_num > cpu_count:
            logger.warning("The number of processes cannot exceed the number of CPUs")
            process_num = cpu_count
        if process_num < 1:
            # a non-positive count would otherwise fail in the chunk-size division below
            logger.warning("The number of processes must be at least 1; using 1 process")
            process_num = 1
        pdf = pypdf.PdfReader(file_path)
        page_length = len(pdf.pages)
        if page_length == 0:
            # nothing to split; range() below would reject a zero step
            return []
        # fewer pages than processes -> a single chunk covering the whole file
        split_len = page_length // process_num or page_length
        page_list = list(range(0, page_length, split_len))
        # range() never yields page_length itself, so close the last chunk explicitly
        page_list.append(page_length)
        page_combination = list(zip(page_list, page_list[1:]))
        page_text = run_process(page_combination, file_path, process_num)
        page_text_all = []
        for item in page_text:
            page_text_all.extend(item)
        return page_text_all

__init__

__init__(remove_numeric_tables: bool = False, language: str = 'en', valid_languages: Optional[List[str]] = None)

Parameters:

Name Type Description Default
remove_numeric_tables bool

This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it does not have table parsing capability for finding answers. However, tables may also have long strings that could possible candidate for searching answers. The rows containing strings are thus retained in this option.

False
valid_languages Optional[List[str]]

validate languages from a list of languages specified in the ISO 639-1 (https://en.wikipedia.org/wiki/ISO_639-1) format. This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text.

None
Source code in pipelines/pipelines/nodes/file_converter/pdf.py
def __init__(
    self,
    remove_numeric_tables: bool = False,
    language: str = "en",
    valid_languages: Optional[List[str]] = None,
):
    """
    :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                  The tabular structures in documents might be noise for the reader model if it
                                  does not have table parsing capability for finding answers. However, tables
                                  may also have long strings that could possible candidate for searching answers.
                                  The rows containing strings are thus retained in this option.
    :param language: Stored on the instance as `self.language`.
    :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                            (https://en.wikipedia.org/wiki/ISO_639-1) format.
                            This option can be used to add test for encoding errors. If the extracted text is
                            not one of the valid languages, then it might likely be encoding error resulting
                            in garbled text.
    """
    # The same two options go to both the config export and the base class.
    base_options = {
        "remove_numeric_tables": remove_numeric_tables,
        "valid_languages": valid_languages,
    }

    # Record the init parameters so the component config can be exported as YAML
    self.set_config(**base_options)

    super().__init__(**base_options)
    self.language = language

convert

convert(file_path: Path, process_num: int = 20, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, language: Optional[str] = 'en', **kwargs: Any) -> List[Dict[str, Any]]

Extract text from a .pdf file using the pypdf library (https://pypdf.readthedocs.io/)

Parameters:

Name Type Description Default
file_path Path

Path to the .pdf file you want to convert

required
process_num int

Number of processes

20
meta Optional[Dict[str, str]]

Optional dictionary with metadata that shall be attached to all resulting documents. Can be any custom keys and values.

None
remove_numeric_tables Optional[bool]

This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it does not have table parsing capability for finding answers. However, tables may also have long strings that could possible candidate for searching answers. The rows containing strings are thus retained in this option.

None
valid_languages Optional[List[str]]

validate languages from a list of languages specified in the ISO 639-1 (https://en.wikipedia.org/wiki/ISO_639-1) format. This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text.

None
Source code in pipelines/pipelines/nodes/file_converter/pdf.py
def convert(
    self,
    file_path: Path,
    process_num: int = 20,
    meta: Optional[Dict[str, str]] = None,
    remove_numeric_tables: Optional[bool] = None,
    valid_languages: Optional[List[str]] = None,
    language: Optional[str] = "en",
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """
    Read a .pdf file through `self._read_pdf` and wrap every returned page text in a document dict.

    :param file_path: Path to the .pdf file you want to convert
    :param process_num: Number of processes, forwarded to `_read_pdf`
    :param meta: Optional dictionary with metadata that shall be attached to all resulting documents.
                 Can be any custom keys and values. The same object is referenced by every document.
    :param remove_numeric_tables: Accepted for interface compatibility; not used by this method.
    :param valid_languages: Accepted for interface compatibility; not used by this method.
    :param language: Accepted for interface compatibility; not used by this method.

    :return: List with one dict of format {"content": page_text, "content_type": "text", "meta": meta}
             per page text returned by `_read_pdf`.
    """
    return [
        {"content": page_text, "content_type": "text", "meta": meta}
        for page_text in self._read_pdf(file_path, process_num=process_num)
    ]

PDFToTextOCRConverter

Source code in pipelines/pipelines/nodes/file_converter/pdf.py
class PDFToTextOCRConverter(BaseConverter):
    """Converter that renders PDF pages to images and extracts their text through OCR."""

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = ["eng"],
    ):
        """
        Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could possible candidate for searching answers.
                                      The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages supported by Tesseract
                                (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                                This option can be used to add test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be encoding error resulting
                                in garbled text.
        """
        # Work on a private copy: the default list object is created once and would
        # otherwise be shared by every instance and by the nested image converter.
        if valid_languages is not None:
            valid_languages = list(valid_languages)

        # init image to text instance
        self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)

        # save init parameters to enable export of component config as YAML
        self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
        super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
    ) -> List[Dict[str, Any]]:
        """
        Convert a PDF file to a single document by running OCR on an image of every page.

        Each page image is written to a temporary .jpeg file next to this module, passed to
        `self.image_2_text.convert`, and the temporary file is removed again. The page texts are
        joined with form feed characters. If anything fails, the error is logged and the
        document contains the text of the pages converted so far.

        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: Accepted for interface compatibility; not used by this method.
        :param valid_languages: Accepted for interface compatibility; not used by this method.
        :param encoding: Accepted for interface compatibility; not used by this method.

        :return: List containing one dict of format {"content": "text of all pages", "meta": meta}
        """
        pages = []
        try:
            images = convert_from_path(file_path)
            for image in images:
                temp_img = tempfile.NamedTemporaryFile(
                    dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg", delete=False
                )
                # Only the reserved path is needed; release our handle before the file is
                # written and read again by name.
                temp_img.close()
                try:
                    image.save(temp_img.name)
                    pages.append(self.image_2_text.convert(temp_img.name)[0]["content"])
                finally:
                    # delete=False means nobody else cleans up, also not when OCR fails
                    os.remove(temp_img.name)
        except Exception as exception:
            logger.error(f"File {file_path} has an error \n {exception}")

        raw_text = "\f".join(pages)
        document = {"content": raw_text, "meta": meta}

        return [document]

__init__

__init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ['eng'])

Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

Parameters:

Name Type Description Default
remove_numeric_tables bool

This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it does not have table parsing capability for finding answers. However, tables may also have long strings that could possible candidate for searching answers. The rows containing strings are thus retained in this option.

False
valid_languages Optional[List[str]]

validate languages from a list of languages supported by Tesseract (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). This option can be used to test for encoding errors. If the extracted text is not in one of the valid languages, it is likely that an encoding error produced garbled text.

['eng']
Source code in pipelines/pipelines/nodes/file_converter/pdf.py
def __init__(
    self,
    remove_numeric_tables: bool = False,
    valid_languages: Optional[List[str]] = ["eng"],
):
    """
    Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)

    :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                  The tabular structures in documents might be noise for the reader model if it
                                  does not have table parsing capability for finding answers. However, tables
                                  may also have long strings that could possible candidate for searching answers.
                                  The rows containing strings are thus retained in this option.
    :param valid_languages: validate languages from a list of languages supported by Tesseract
                            (https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
                            This option can be used to add test for encoding errors. If the extracted text is
                            not one of the valid languages, then it might likely be encoding error resulting
                            in garbled text.
    """
    # Work on a private copy: the default list object is created once and would
    # otherwise be shared by every instance and by the nested image converter.
    if valid_languages is not None:
        valid_languages = list(valid_languages)

    # init image to text instance
    self.image_2_text = ImageToTextConverter(remove_numeric_tables, valid_languages)

    # save init parameters to enable export of component config as YAML
    self.set_config(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)
    super().__init__(remove_numeric_tables=remove_numeric_tables, valid_languages=valid_languages)

convert

convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = 'utf-8') -> List[Dict[str, Any]]

Convert a file to a dictionary containing the text and any associated meta data.

File converters may extract file meta like name or size. In addition to it, user supplied meta data like author, url, external IDs can be supplied as a dictionary.

Parameters:

Name Type Description Default
file_path Path

path of the file to convert

required
meta Optional[Dict[str, str]]

dictionary of meta data key-value pairs to append in the returned document.

None
remove_numeric_tables Optional[bool]

This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it does not have table parsing capability for finding answers. However, tables may also have long strings that could possible candidate for searching answers. The rows containing strings are thus retained in this option.

None
valid_languages Optional[List[str]]

validate languages from a list of languages specified in the ISO 639-1 (https://en.wikipedia.org/wiki/ISO_639-1) format. This option can be used to add test for encoding errors. If the extracted text is not one of the valid languages, then it might likely be encoding error resulting in garbled text.

None
encoding Optional[str]

Select the file encoding (default is utf-8)

'utf-8'
Source code in pipelines/pipelines/nodes/file_converter/pdf.py
def convert(
    self,
    file_path: Path,
    meta: Optional[Dict[str, str]] = None,
    remove_numeric_tables: Optional[bool] = None,
    valid_languages: Optional[List[str]] = None,
    encoding: Optional[str] = "utf-8",
) -> List[Dict[str, Any]]:
    """
    Convert a PDF file to a single document by running OCR on an image of every page.

    Each page image is written to a temporary .jpeg file next to this module, passed to
    `self.image_2_text.convert`, and the temporary file is removed again. The page texts are
    joined with form feed characters. If anything fails, the error is logged and the
    document contains the text of the pages converted so far.

    :param file_path: path of the file to convert
    :param meta: dictionary of meta data key-value pairs to append in the returned document.
    :param remove_numeric_tables: Accepted for interface compatibility; not used by this method.
    :param valid_languages: Accepted for interface compatibility; not used by this method.
    :param encoding: Accepted for interface compatibility; not used by this method.

    :return: List containing one dict of format {"content": "text of all pages", "meta": meta}
    """
    pages = []
    try:
        images = convert_from_path(file_path)
        for image in images:
            temp_img = tempfile.NamedTemporaryFile(
                dir=os.path.dirname(os.path.realpath(__file__)), suffix=".jpeg", delete=False
            )
            # Only the reserved path is needed; release our handle before the file is
            # written and read again by name.
            temp_img.close()
            try:
                image.save(temp_img.name)
                pages.append(self.image_2_text.convert(temp_img.name)[0]["content"])
            finally:
                # delete=False means nobody else cleans up, also not when OCR fails
                os.remove(temp_img.name)
    except Exception as exception:
        logger.error(f"File {file_path} has an error \n {exception}")

    raw_text = "\f".join(pages)
    document = {"content": raw_text, "meta": meta}

    return [document]

pipelines.pipelines.nodes.file_converter.txt

TextConverter

Source code in pipelines/pipelines/nodes/file_converter/txt.py
class TextConverter(BaseConverter):
    """Converter that loads a plain-text file into a single document dictionary."""

    def convert(
        self,
        file_path: Path,
        meta: Optional[Dict[str, str]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "utf-8",
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """
        Read the whole text file and wrap its content in a document dictionary.

        :param file_path: path of the file to convert
        :param meta: dictionary of meta data key-value pairs, stored unchanged under the
                     ``"meta"`` key of the returned document.
        :param remove_numeric_tables: when ``None``, falls back to ``self.remove_numeric_tables``.
                                      The resolved value is not used any further by this method.
        :param valid_languages: when ``None``, falls back to ``self.valid_languages``.
                                The resolved value is not used any further by this method.
        :param encoding: encoding used to open the file (default is `utf-8`). Decoding errors
                         are ignored rather than raised.
        :param kwargs: accepted for signature compatibility; not used by this method.

        :return: a one-element list holding
                 ``{"content": <file text>, "content_type": "text", "meta": meta}``
        """
        # Resolve per-call overrides against the instance-level defaults.
        remove_numeric_tables = (
            self.remove_numeric_tables if remove_numeric_tables is None else remove_numeric_tables
        )
        valid_languages = self.valid_languages if valid_languages is None else valid_languages

        # errors="ignore" drops undecodable bytes instead of raising.
        with open(file_path, encoding=encoding, errors="ignore") as source:
            content = source.read()

        return [{"content": content, "content_type": "text", "meta": meta}]

convert

convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = 'utf-8', **kwargs: Any) -> List[Dict[str, Any]]

Reads text from a txt file and executes optional preprocessing steps.

Parameters:

Name Type Description Default
file_path Path

path of the file to convert

required
meta Optional[Dict[str, str]]

dictionary of meta data key-value pairs to append in the returned document.

None
remove_numeric_tables Optional[bool]

This option uses heuristics to remove numeric rows from the tables. The tabular structures in documents might be noise for the reader model if it does not have table parsing capability for finding answers. However, tables may also contain long strings that could be possible candidates when searching for answers. The rows containing strings are thus retained in this option.

None
valid_languages Optional[List[str]]

Validate languages from a list of languages specified in the ISO 639-1 (https://en.wikipedia.org/wiki/ISO_639-1) format. This option can be used to add a test for encoding errors. If the extracted text is not in one of the valid languages, it is likely an encoding error resulting in garbled text.

None
encoding Optional[str]

Select the file encoding (default is utf-8)

'utf-8'

Returns:

Type Description
List[Dict[str, Any]]

A list containing one dict of format {"content": "The text from file", "content_type": "text", "meta": meta}

Source code in pipelines/pipelines/nodes/file_converter/txt.py
def convert(
    self,
    file_path: Path,
    meta: Optional[Dict[str, str]] = None,
    remove_numeric_tables: Optional[bool] = None,
    valid_languages: Optional[List[str]] = None,
    encoding: Optional[str] = "utf-8",
    **kwargs: Any,
) -> List[Dict[str, Any]]:
    """
    Read the whole text file and wrap its content in a document dictionary.

    :param file_path: path of the file to convert
    :param meta: dictionary of meta data key-value pairs, stored unchanged under the
                 ``"meta"`` key of the returned document.
    :param remove_numeric_tables: when ``None``, falls back to ``self.remove_numeric_tables``.
                                  The resolved value is not used any further by this method.
    :param valid_languages: when ``None``, falls back to ``self.valid_languages``.
                            The resolved value is not used any further by this method.
    :param encoding: encoding used to open the file (default is `utf-8`). Decoding errors
                     are ignored rather than raised.
    :param kwargs: accepted for signature compatibility; not used by this method.

    :return: a one-element list holding
             ``{"content": <file text>, "content_type": "text", "meta": meta}``
    """
    # Resolve per-call overrides against the instance-level defaults.
    remove_numeric_tables = (
        self.remove_numeric_tables if remove_numeric_tables is None else remove_numeric_tables
    )
    valid_languages = self.valid_languages if valid_languages is None else valid_languages

    # errors="ignore" drops undecodable bytes instead of raising.
    with open(file_path, encoding=encoding, errors="ignore") as source:
        content = source.read()

    return [{"content": content, "content_type": "text", "meta": meta}]