Search Engine Module¶

pipelines.pipelines.nodes.search_engine.providers ¶

SearchApi ¶

SearchApi is a real-time search engine that provides an API to access search results from Google, Google Scholar, YouTube, YouTube transcripts and more. See the SearchApi website for more details.

Source code in pipelines/pipelines/nodes/search_engine/providers.py

class SearchApi(SearchEngine):
    """
    SearchApi is a real-time search engine that provides an API to access search results from Google, Google Scholar, YouTube,
    YouTube transcripts and more. See the [SearchApi website](https://www.searchapi.io/) for more details.
    """

    def __init__(
        self,
        api_key: str,
        top_k: Optional[int] = 10,
        engine: Optional[str] = "google",
        search_engine_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for SearchApi.
        :param top_k: Number of results to return.
        :param engine: Search engine to use, for example google, google_scholar, youtube, youtube_transcripts.
        See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported engines.
        :param search_engine_kwargs: Additional parameters passed to the SearchApi.
        See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported parameters.
        """
        super().__init__()
        self.params_dict: Dict[str, Union[str, int, float]] = {}
        self.api_key = api_key
        self.kwargs = search_engine_kwargs if search_engine_kwargs else {}
        self.engine = engine
        self.top_k = top_k

    def search(self, query: str, **kwargs) -> List[Document]:
        """
        Query SearchApi and convert the JSON response into Documents.

        Documents are collected in this order: answer box, knowledge graph, organic results, related questions.
        The list is then truncated to `top_k` and handed to `self.score_results`.

        :param query: Query string.
        :param kwargs: Additional parameters passed to the SearchApi. For example, you can set 'location' to 'New York,United States'
        to localize search to the specific location. A `top_k` entry is not sent to the API; it overrides the instance-level
        `top_k` for this call.
        :return: List[Document]
        :raises Exception: If the HTTP status code of the response is not 200.
        """
        # Per-call kwargs take precedence over the defaults given to the constructor.
        kwargs = {**self.kwargs, **kwargs}
        top_k = kwargs.pop("top_k", self.top_k)
        url = "https://www.searchapi.io/api/v1/search"

        params = {"q": query, **kwargs}
        headers = {"Authorization": f"Bearer {self.api_key}", "X-SearchApi-Source": "PaddleNLP"}

        if self.engine:
            params["engine"] = self.engine
        response = requests.get(url, params=params, headers=headers, timeout=90)

        if response.status_code != 200:
            raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")

        json_content = json.loads(response.text)
        documents = []
        has_answer_box = False

        answer_box = json_content.get("answer_box")
        if answer_box:
            # The nested organic result and the population-graph fields are only fallbacks:
            # the answer box's own title/link win when present. Previously the fallbacks were
            # always overwritten, so answer boxes without a direct link were silently dropped.
            fallback_title, fallback_link = "", ""
            organic_result = answer_box.get("organic_result")
            if organic_result:
                fallback_title = organic_result.get("title", "")
                fallback_link = organic_result.get("link", "")
            if answer_box.get("type") == "population_graph":
                fallback_title = answer_box.get("place", "")
                fallback_link = answer_box.get("explore_more_link", "")

            title = answer_box.get("title") or fallback_title
            link = answer_box.get("link") or fallback_link
            content = answer_box.get("answer") or answer_box.get("snippet")

            if link and content:
                has_answer_box = True
                documents.append(Document.from_dict({"title": title, "content": content, "link": link}))

        knowledge_graph = json_content.get("knowledge_graph")
        if knowledge_graph:
            # Prefer the entity's website; fall back to the link of the description source.
            source = knowledge_graph.get("source")
            source_link = source.get("link", "") if source else ""
            link = knowledge_graph.get("website") or source_link
            content = knowledge_graph.get("description")

            if link and content:
                documents.append(
                    Document.from_dict({"title": knowledge_graph.get("title", ""), "content": content, "link": link})
                )

        # Not every response carries "organic_results", so a missing section must not raise a KeyError.
        documents += [
            Document.from_dict({"title": c.get("title", ""), "content": c.get("snippet", ""), "link": c.get("link", "")})
            for c in json_content.get("organic_results") or []
        ]

        for question in json_content.get("related_questions") or []:
            source = question.get("source")
            link = source.get("link", "") if source else ""
            content = question.get("answer", "")

            # Related questions without a source link or an answer are not useful as documents.
            if link and content:
                documents.append(
                    Document.from_dict({"title": question.get("question", ""), "content": content, "link": link})
                )

        logger.debug("SearchApi returned %s documents for the query '%s'", len(documents), query)
        result_docs = documents[:top_k]
        return self.score_results(result_docs, has_answer_box)

`__init__` ¶

__init__(api_key: str, top_k: Optional[int] = 10, engine: Optional[str] = 'google', search_engine_kwargs: Optional[Dict[str, Any]] = None)

Parameters:

Name	Type	Description	Default
`api_key`	`str`	API key for SearchApi.	required
`top_k`	`Optional[int]`	Number of results to return.	`10`
`engine`	`Optional[str]`	Search engine to use, for example google, google_scholar, youtube, youtube_transcripts. See the SearchApi documentation for the full list of supported engines.	`'google'`
`search_engine_kwargs`	`Optional[Dict[str, Any]]`	Additional parameters passed to the SearchApi. See the SearchApi documentation for the full list of supported parameters.	`None`

Source code in pipelines/pipelines/nodes/search_engine/providers.py

def __init__(
    self,
    api_key: str,
    top_k: Optional[int] = 10,
    engine: Optional[str] = "google",
    search_engine_kwargs: Optional[Dict[str, Any]] = None,
):
    """
    Store the SearchApi credentials and the default query settings on the instance.

    :param api_key: API key for SearchApi.
    :param top_k: Number of results to return.
    :param engine: Search engine to use, for example google, google_scholar, youtube, youtube_transcripts.
    See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported engines.
    :param search_engine_kwargs: Additional parameters passed to the SearchApi.
    See the [SearchApi documentation](https://www.searchapi.io/docs/google) for the full list of supported parameters.
    """
    super().__init__()
    self.api_key = api_key
    self.engine = engine
    self.top_k = top_k
    # A missing or empty mapping is replaced by a new empty dict.
    self.kwargs = search_engine_kwargs or {}
    # Initialised empty; its consumers are not visible in this listing.
    self.params_dict: Dict[str, Union[str, int, float]] = {}

search ¶

search(query: str, **kwargs) -> List[Document]

Parameters:

Name	Type	Description	Default
`query`	`str`	Query string.	required
`kwargs`		Additional parameters passed to the SearchApi. For example, you can set 'location' to 'New York,United States' to localize search to the specific location.	`{}`

Returns:

Type	Description
`List[Document]`	List[Document]

Source code in pipelines/pipelines/nodes/search_engine/providers.py

def search(self, query: str, **kwargs) -> List[Document]:
    """
    Query SearchApi and convert the JSON response into Documents.

    Documents are collected in this order: answer box, knowledge graph, organic results, related questions.
    The list is then truncated to `top_k` and handed to `self.score_results`.

    :param query: Query string.
    :param kwargs: Additional parameters passed to the SearchApi. For example, you can set 'location' to 'New York,United States'
    to localize search to the specific location. A `top_k` entry is not sent to the API; it overrides the instance-level
    `top_k` for this call.
    :return: List[Document]
    :raises Exception: If the HTTP status code of the response is not 200.
    """
    # Per-call kwargs take precedence over the defaults given to the constructor.
    kwargs = {**self.kwargs, **kwargs}
    top_k = kwargs.pop("top_k", self.top_k)
    url = "https://www.searchapi.io/api/v1/search"

    params = {"q": query, **kwargs}
    headers = {"Authorization": f"Bearer {self.api_key}", "X-SearchApi-Source": "PaddleNLP"}

    if self.engine:
        params["engine"] = self.engine
    response = requests.get(url, params=params, headers=headers, timeout=90)

    if response.status_code != 200:
        raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")

    json_content = json.loads(response.text)
    documents = []
    has_answer_box = False

    answer_box = json_content.get("answer_box")
    if answer_box:
        # The nested organic result and the population-graph fields are only fallbacks:
        # the answer box's own title/link win when present. Previously the fallbacks were
        # always overwritten, so answer boxes without a direct link were silently dropped.
        fallback_title, fallback_link = "", ""
        organic_result = answer_box.get("organic_result")
        if organic_result:
            fallback_title = organic_result.get("title", "")
            fallback_link = organic_result.get("link", "")
        if answer_box.get("type") == "population_graph":
            fallback_title = answer_box.get("place", "")
            fallback_link = answer_box.get("explore_more_link", "")

        title = answer_box.get("title") or fallback_title
        link = answer_box.get("link") or fallback_link
        content = answer_box.get("answer") or answer_box.get("snippet")

        if link and content:
            has_answer_box = True
            documents.append(Document.from_dict({"title": title, "content": content, "link": link}))

    knowledge_graph = json_content.get("knowledge_graph")
    if knowledge_graph:
        # Prefer the entity's website; fall back to the link of the description source.
        source = knowledge_graph.get("source")
        source_link = source.get("link", "") if source else ""
        link = knowledge_graph.get("website") or source_link
        content = knowledge_graph.get("description")

        if link and content:
            documents.append(
                Document.from_dict({"title": knowledge_graph.get("title", ""), "content": content, "link": link})
            )

    # Not every response carries "organic_results", so a missing section must not raise a KeyError.
    documents += [
        Document.from_dict({"title": c.get("title", ""), "content": c.get("snippet", ""), "link": c.get("link", "")})
        for c in json_content.get("organic_results") or []
    ]

    for question in json_content.get("related_questions") or []:
        source = question.get("source")
        link = source.get("link", "") if source else ""
        content = question.get("answer", "")

        # Related questions without a source link or an answer are not useful as documents.
        if link and content:
            documents.append(
                Document.from_dict({"title": question.get("question", ""), "content": content, "link": link})
            )

    logger.debug("SearchApi returned %s documents for the query '%s'", len(documents), query)
    result_docs = documents[:top_k]
    return self.score_results(result_docs, has_answer_box)

SerpAPI ¶

SerpAPI is a search engine that provides a REST API to access search results from Google, Bing, Yahoo, Yandex, Amazon, and similar. See the SerpAPI website for more details.

Source code in pipelines/pipelines/nodes/search_engine/providers.py

class SerpAPI(SearchEngine):
    """
    SerpAPI is a search engine that provides a REST API to access search results from Google, Bing, Yahoo, Yandex,
    Amazon, and similar. See the [SerpAPI website](https://serpapi.com/) for more details.
    """

    def __init__(
        self,
        api_key: str,
        top_k: Optional[int] = 10,
        engine: Optional[str] = "google",
        search_engine_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for SerpAPI.
        :param top_k: Number of results to return.
        :param engine: Search engine to use, for example google, bing, baidu, duckduckgo, yahoo, yandex.
        See the [SerpAPI documentation](https://serpapi.com/search-api) for the full list of supported engines.
        :param search_engine_kwargs: Additional parameters passed to the SerpAPI. For example, you can set 'lr' to 'lang_en'
        to limit the search to English.
        See the [SerpAPI documentation](https://serpapi.com/search-api) for the full list of supported parameters.
        """
        super().__init__()
        self.params_dict: Dict[str, Union[str, int, float]] = {}
        self.api_key = api_key
        self.kwargs = search_engine_kwargs if search_engine_kwargs else {}
        self.engine = engine
        self.top_k = top_k

    def search(self, query: str, **kwargs) -> List[Document]:
        """
        Query SerpAPI and convert the JSON response into Documents.

        Documents are collected in this order: answer box, organic results, "people also search for", related
        questions. The list is then truncated to `top_k` and handed to `self.score_results`.

        :param query: Query string.
        :param kwargs: Additional parameters passed to the SerpAPI. For example, you can set 'lr' to 'lang_en'
        to limit the search to English. A `top_k` entry is not sent to the API; it overrides the instance-level
        `top_k` for this call.
        See the [SerpAPI documentation](https://serpapi.com/search-api) for the full list of supported parameters.
        :return: List[Document]
        :raises Exception: If the HTTP status code of the response is not 200.
        """
        # Per-call kwargs take precedence over the defaults given to the constructor.
        kwargs = {**self.kwargs, **kwargs}
        top_k = kwargs.pop("top_k", self.top_k)
        url = "https://serpapi.com/search"

        params = {"source": "python", "serp_api_key": self.api_key, "q": query, **kwargs}

        if self.engine:
            params["engine"] = self.engine
        response = requests.get(url, params, timeout=30)

        if response.status_code != 200:
            raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")

        json_result = json.loads(response.text)
        # A response without organic results must not raise a KeyError.
        organic = [
            Document.from_dict(d, field_map={"snippet": "content"})
            for d in json_result.get("organic_results") or []
            if "snippet" in d
        ]

        answer_box = []
        answer_dict = json_result.get("answer_box") or {}
        # The first key that yields content wins; the keys are tried in this fixed order.
        for key in ["answer", "snippet_highlighted_words", "snippet", "title"]:
            if key not in answer_dict:
                continue
            answer_box_content = answer_dict[key]
            if isinstance(answer_box_content, list):
                if not answer_box_content:
                    # An empty list carries no content; try the next candidate key instead of failing.
                    continue
                answer_box_content = answer_box_content[0]
            answer_box = [
                Document.from_dict(
                    {
                        "title": answer_dict.get("title", ""),
                        "content": answer_box_content,
                        "link": answer_dict.get("displayed_link", ""),
                    }
                )
            ]
            break

        # Both sections share the same entry layout and are appended in this order.
        related = []
        for section in ("people_also_search_for", "related_questions"):
            for result in json_result.get(section) or []:
                title = result.get("title", "")
                # The title stands in as content when the entry has no snippet.
                content = result.get("snippet") or title
                if not content:
                    continue
                related.append(Document.from_dict({"title": title, "content": content, "link": result.get("link", "")}))

        documents = answer_box + organic + related

        logger.debug("SerpAPI returned %s documents for the query '%s'", len(documents), query)
        result_docs = documents[:top_k]
        return self.score_results(result_docs, len(answer_box) > 0)

`__init__` ¶

__init__(api_key: str, top_k: Optional[int] = 10, engine: Optional[str] = 'google', search_engine_kwargs: Optional[Dict[str, Any]] = None)

Parameters:

Name	Type	Description	Default
`api_key`	`str`	API key for SerpAPI.	required
`top_k`	`Optional[int]`	Number of results to return.	`10`
`engine`	`Optional[str]`	Search engine to use, for example google, bing, baidu, duckduckgo, yahoo, yandex. See the SerpAPI documentation for the full list of supported engines.	`'google'`
`search_engine_kwargs`	`Optional[Dict[str, Any]]`	Additional parameters passed to the SerpAPI. For example, you can set 'lr' to 'lang_en' to limit the search to English. See the SerpAPI documentation for the full list of supported parameters.	`None`

Source code in pipelines/pipelines/nodes/search_engine/providers.py

def __init__(
    self,
    api_key: str,
    top_k: Optional[int] = 10,
    engine: Optional[str] = "google",
    search_engine_kwargs: Optional[Dict[str, Any]] = None,
):
    """
    Store the SerpAPI credentials and the default query settings on the instance.

    :param api_key: API key for SerpAPI.
    :param top_k: Number of results to return.
    :param engine: Search engine to use, for example google, bing, baidu, duckduckgo, yahoo, yandex.
    See the [SerpAPI documentation](https://serpapi.com/search-api) for the full list of supported engines.
    :param search_engine_kwargs: Additional parameters passed to the SerpAPI. For example, you can set 'lr' to 'lang_en'
    to limit the search to English.
    See the [SerpAPI documentation](https://serpapi.com/search-api) for the full list of supported parameters.
    """
    super().__init__()
    self.api_key = api_key
    self.engine = engine
    self.top_k = top_k
    # A missing or empty mapping is replaced by a new empty dict.
    self.kwargs = search_engine_kwargs or {}
    # Initialised empty; its consumers are not visible in this listing.
    self.params_dict: Dict[str, Union[str, int, float]] = {}

search ¶

search(query: str, **kwargs) -> List[Document]

Parameters:

Name	Type	Description	Default
`query`	`str`	Query string.	required
`kwargs`		Additional parameters passed to the SerpAPI. For example, you can set 'lr' to 'lang_en' to limit the search to English. See the SerpAPI documentation for the full list of supported parameters.	`{}`

Returns:

Type	Description
`List[Document]`	List[Document]

Source code in pipelines/pipelines/nodes/search_engine/providers.py

def search(self, query: str, **kwargs) -> List[Document]:
    """
    Query SerpAPI and convert the JSON response into Documents.

    Documents are collected in this order: answer box, organic results, "people also search for", related
    questions. The list is then truncated to `top_k` and handed to `self.score_results`.

    :param query: Query string.
    :param kwargs: Additional parameters passed to the SerpAPI. For example, you can set 'lr' to 'lang_en'
    to limit the search to English. A `top_k` entry is not sent to the API; it overrides the instance-level
    `top_k` for this call.
    See the [SerpAPI documentation](https://serpapi.com/search-api) for the full list of supported parameters.
    :return: List[Document]
    :raises Exception: If the HTTP status code of the response is not 200.
    """
    # Per-call kwargs take precedence over the defaults given to the constructor.
    kwargs = {**self.kwargs, **kwargs}
    top_k = kwargs.pop("top_k", self.top_k)
    url = "https://serpapi.com/search"

    params = {"source": "python", "serp_api_key": self.api_key, "q": query, **kwargs}

    if self.engine:
        params["engine"] = self.engine
    response = requests.get(url, params, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")

    json_result = json.loads(response.text)
    # A response without organic results must not raise a KeyError.
    organic = [
        Document.from_dict(d, field_map={"snippet": "content"})
        for d in json_result.get("organic_results") or []
        if "snippet" in d
    ]

    answer_box = []
    answer_dict = json_result.get("answer_box") or {}
    # The first key that yields content wins; the keys are tried in this fixed order.
    for key in ["answer", "snippet_highlighted_words", "snippet", "title"]:
        if key not in answer_dict:
            continue
        answer_box_content = answer_dict[key]
        if isinstance(answer_box_content, list):
            if not answer_box_content:
                # An empty list carries no content; try the next candidate key instead of failing.
                continue
            answer_box_content = answer_box_content[0]
        answer_box = [
            Document.from_dict(
                {
                    "title": answer_dict.get("title", ""),
                    "content": answer_box_content,
                    "link": answer_dict.get("displayed_link", ""),
                }
            )
        ]
        break

    # Both sections share the same entry layout and are appended in this order.
    related = []
    for section in ("people_also_search_for", "related_questions"):
        for result in json_result.get(section) or []:
            title = result.get("title", "")
            # The title stands in as content when the entry has no snippet.
            content = result.get("snippet") or title
            if not content:
                continue
            related.append(Document.from_dict({"title": title, "content": content, "link": result.get("link", "")}))

    documents = answer_box + organic + related

    logger.debug("SerpAPI returned %s documents for the query '%s'", len(documents), query)
    result_docs = documents[:top_k]
    return self.score_results(result_docs, len(answer_box) > 0)

SerperDev ¶

Serper.dev is a search engine that provides a REST API to access search results from Google. See the Serper.dev website for more details.

Source code in pipelines/pipelines/nodes/search_engine/providers.py

class SerperDev(SearchEngine):
    """
    Serper.dev is a search engine that provides a REST API to access search results from Google. See the [Serper.dev website](https://serper.dev/) for more details.
    """

    def __init__(
        self,
        api_key: str,
        top_k: Optional[int] = 10,
        engine: Optional[str] = "google",
        search_engine_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for Serper.dev API.
        :param top_k: Number of results to return.
        :param engine: Search engine to use, only supports Google. The value is stored on the instance but is not
        sent with the request built by `search`.
        :param search_engine_kwargs: Additional parameters passed to the SerperDev API. For example, you can set 'hl' to 'en'
        to set the search results language to English.
        See the [Serper.dev documentation](https://serper.dev/playground) for the full list of supported parameters.
        """
        super().__init__()
        self.params_dict: Dict[str, Union[str, int, float]] = {}
        self.api_key = api_key
        self.kwargs = search_engine_kwargs if search_engine_kwargs else {}
        self.engine = engine
        self.top_k = top_k

    def search(self, query: str, **kwargs) -> List[Document]:
        """
        Query the Serper.dev API and convert the JSON response into Documents.

        Documents are collected in this order: answer box, organic results, "people also search for", related
        searches, "people also ask". The list is then truncated to `top_k` and handed to `self.score_results`.

        :param query: Query string.
        :param kwargs: Additional parameters passed to the Serper.dev API. For example, you can set 'hl' to 'en'
        to set the search results language to English. A `top_k` entry is not sent to the API; it overrides the
        instance-level `top_k` for this call.
        See the [Serper.dev documentation](https://serper.dev/playground) for the full list of supported parameters.
        :return: List[Document]
        :raises Exception: If the HTTP status code of the response is not 200.
        """
        # Per-call kwargs take precedence over the defaults given to the constructor.
        kwargs = {**self.kwargs, **kwargs}
        top_k = kwargs.pop("top_k", self.top_k)
        url = "https://google.serper.dev/search"

        params = {"q": query, **kwargs}

        headers = {"X-API-KEY": self.api_key, "Content-Type": "application/json"}

        response = requests.post(url, headers=headers, json=params, timeout=30)

        if response.status_code != 200:
            raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")

        json_result = json.loads(response.text)
        # A response without organic results must not raise a KeyError.
        organic = [
            Document.from_dict(d, field_map={"snippet": "content"})
            for d in json_result.get("organic") or []
            if "snippet" in d
        ]

        answer_box = []
        answer_dict = json_result.get("answerBox") or {}
        # The first key that yields content wins; the keys are tried in this fixed order.
        for key in ["answer", "snippetHighlighted", "snippet", "title"]:
            if key not in answer_dict:
                continue
            answer_box_content = answer_dict[key]
            if isinstance(answer_box_content, list):
                if not answer_box_content:
                    # An empty list carries no content; try the next candidate key instead of failing.
                    continue
                answer_box_content = answer_box_content[0]
            answer_box = [
                Document.from_dict(
                    {
                        "title": answer_dict.get("title", ""),
                        "content": answer_box_content,
                        "link": answer_dict.get("link", ""),
                    }
                )
            ]
            break

        def titled_documents(results):
            # Convert title/snippet/link entries; the title stands in as content when there is no snippet.
            converted = []
            for result in results or []:
                title = result.get("title", "")
                content = result.get("snippet") or title
                if not content:
                    continue
                converted.append(
                    Document.from_dict({"title": title, "content": content, "link": result.get("link", "")})
                )
            return converted

        people_also_search = titled_documents(json_result.get("peopleAlsoSearchFor"))

        # Related searches only provide the suggested query text, which becomes the document content.
        related_searches = [
            Document.from_dict({"content": result.get("query", "")})
            for result in json_result.get("relatedSearches") or []
        ]

        related_questions = titled_documents(json_result.get("peopleAlsoAsk"))

        documents = answer_box + organic + people_also_search + related_searches + related_questions

        logger.debug("Serper.dev API returned %s documents for the query '%s'", len(documents), query)
        result_docs = documents[:top_k]
        return self.score_results(result_docs, len(answer_box) > 0)

`__init__` ¶

__init__(api_key: str, top_k: Optional[int] = 10, engine: Optional[str] = 'google', search_engine_kwargs: Optional[Dict[str, Any]] = None)

Parameters:

Name	Type	Description	Default
`api_key`	`str`	API key for Serper.dev API.	required
`top_k`	`Optional[int]`	Number of results to return.	`10`
`engine`	`Optional[str]`	Search engine to use, only supports Google.	`'google'`
`search_engine_kwargs`	`Optional[Dict[str, Any]]`	Additional parameters passed to the SerperDev API. For example, you can set 'hl' to 'en' to set the search results language to English. See the Serper.dev documentation for the full list of supported parameters.	`None`

Source code in pipelines/pipelines/nodes/search_engine/providers.py

def __init__(
    self,
    api_key: str,
    top_k: Optional[int] = 10,
    engine: Optional[str] = "google",
    search_engine_kwargs: Optional[Dict[str, Any]] = None,
):
    """
    Store the Serper.dev credentials and the default query settings on the instance.

    :param api_key: API key for Serper.dev API.
    :param top_k: Number of results to return.
    :param engine: Search engine to use, only supports Google.
    :param search_engine_kwargs: Additional parameters passed to the SerperDev API. For example, you can set 'hl' to 'en'
    to set the search results language to English.
    See the [Serper.dev documentation](https://serper.dev/playground) for the full list of supported parameters.
    """
    super().__init__()
    self.api_key = api_key
    self.engine = engine
    self.top_k = top_k
    # A missing or empty mapping is replaced by a new empty dict.
    self.kwargs = search_engine_kwargs or {}
    # Initialised empty; its consumers are not visible in this listing.
    self.params_dict: Dict[str, Union[str, int, float]] = {}

search ¶

search(query: str, **kwargs) -> List[Document]

Parameters:

Name	Type	Description	Default
`query`	`str`	Query string.	required
`kwargs`		Additional parameters passed to the Serper.dev API. For example, you can set 'hl' to 'en' to set the search results language to English. See the Serper.dev documentation for the full list of supported parameters.	`{}`

Returns:

Type	Description
`List[Document]`	List[Document]

Source code in pipelines/pipelines/nodes/search_engine/providers.py

def search(self, query: str, **kwargs) -> List[Document]:
    """
    Query the Serper.dev API and convert the JSON response into Documents.

    Documents are collected in this order: answer box, organic results, "people also search for", related
    searches, "people also ask". The list is then truncated to `top_k` and handed to `self.score_results`.

    :param query: Query string.
    :param kwargs: Additional parameters passed to the Serper.dev API. For example, you can set 'hl' to 'en'
    to set the search results language to English. A `top_k` entry is not sent to the API; it overrides the
    instance-level `top_k` for this call.
    See the [Serper.dev documentation](https://serper.dev/playground) for the full list of supported parameters.
    :return: List[Document]
    :raises Exception: If the HTTP status code of the response is not 200.
    """
    # Per-call kwargs take precedence over the defaults given to the constructor.
    kwargs = {**self.kwargs, **kwargs}
    top_k = kwargs.pop("top_k", self.top_k)
    url = "https://google.serper.dev/search"

    params = {"q": query, **kwargs}

    headers = {"X-API-KEY": self.api_key, "Content-Type": "application/json"}

    response = requests.post(url, headers=headers, json=params, timeout=30)

    if response.status_code != 200:
        raise Exception(f"Error while querying {self.__class__.__name__}: {response.text}")

    json_result = json.loads(response.text)
    # A response without organic results must not raise a KeyError.
    organic = [
        Document.from_dict(d, field_map={"snippet": "content"})
        for d in json_result.get("organic") or []
        if "snippet" in d
    ]

    answer_box = []
    answer_dict = json_result.get("answerBox") or {}
    # The first key that yields content wins; the keys are tried in this fixed order.
    for key in ["answer", "snippetHighlighted", "snippet", "title"]:
        if key not in answer_dict:
            continue
        answer_box_content = answer_dict[key]
        if isinstance(answer_box_content, list):
            if not answer_box_content:
                # An empty list carries no content; try the next candidate key instead of failing.
                continue
            answer_box_content = answer_box_content[0]
        answer_box = [
            Document.from_dict(
                {
                    "title": answer_dict.get("title", ""),
                    "content": answer_box_content,
                    "link": answer_dict.get("link", ""),
                }
            )
        ]
        break

    def titled_documents(results):
        # Convert title/snippet/link entries; the title stands in as content when there is no snippet.
        converted = []
        for result in results or []:
            title = result.get("title", "")
            content = result.get("snippet") or title
            if not content:
                continue
            converted.append(Document.from_dict({"title": title, "content": content, "link": result.get("link", "")}))
        return converted

    people_also_search = titled_documents(json_result.get("peopleAlsoSearchFor"))

    # Related searches only provide the suggested query text, which becomes the document content.
    related_searches = [
        Document.from_dict({"content": result.get("query", "")})
        for result in json_result.get("relatedSearches") or []
    ]

    related_questions = titled_documents(json_result.get("peopleAlsoAsk"))

    documents = answer_box + organic + people_also_search + related_searches + related_questions

    logger.debug("Serper.dev API returned %s documents for the query '%s'", len(documents), query)
    result_docs = documents[:top_k]
    return self.score_results(result_docs, len(answer_box) > 0)

pipelines.pipelines.nodes.search_engine.web ¶

WebSearch ¶

WebSearch queries a search engine and retrieves results as a list of Documents. WebSearch abstracts away the details of the underlying search engine provider, provides common interface for all providers, and makes it possible to use various search engines.

WebSearch currently supports the following search engine providers (bridges): - SerperDev - SearchApi - SerpAPI (default) - BingAPI

Source code in pipelines/pipelines/nodes/search_engine/web.py

class WebSearch(BaseComponent):
    """
    WebSearch queries a search engine and retrieves results as a list of Documents. WebSearch abstracts away the details
    of the underlying search engine provider, provides a common interface for all providers, and makes it possible to
    use various search engines.

    WebSearch currently supports the following search engine providers (bridges):
    - SerperDev
    - SearchApi
    - SerpAPI (default)
    - BingAPI

    """

    outgoing_edges = 1

    def __init__(
        self,
        api_key: str,
        top_k: Optional[int] = 10,
        search_engine_provider: Union[str, SearchEngine] = "SerpAPI",
        engine: Optional[str] = "google",
        search_engine_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """
        :param api_key: API key for the search engine provider.
        :param top_k: Number of results to return. Forwarded to the provider class when the provider is given by name.
        :param search_engine_provider: Name of the search engine provider class, see `providers.py` for a list of
        supported providers. Either a bare class name from `providers.py`, a fully qualified dotted path to a
        SearchEngine subclass, or an already constructed SearchEngine instance. When an instance is passed it is used
        as is and `api_key`, `top_k`, `engine` and `search_engine_kwargs` are ignored.
        :param engine: Search engine to use, for example google. Forwarded to the provider class when the provider is
        given by name.
        :param search_engine_kwargs: Additional parameters to pass to the search engine provider.
        :raises ValueError: If the provider cannot be located, is not a SearchEngine subclass, or is neither a string
        nor a SearchEngine instance.
        """
        super().__init__()
        if isinstance(search_engine_provider, str):
            # try to find the provider class: first as a name inside providers.py, then as a full dotted path
            search_path = [f"pipelines.nodes.search_engine.providers.{search_engine_provider}", search_engine_provider]
            # Skip candidates that do not resolve. A plain next() over the locate() results would stop at the
            # first candidate even when it is None, so the full-path fallback would never be tried.
            klass: Optional[Type[SearchEngine]] = next(  # type: ignore
                (located for located in map(pydoc.locate, search_path) if located is not None), None
            )

            if klass is None:
                raise ValueError(
                    f"Could not locate the SearchEngine class with the name {search_engine_provider}. "
                    f"Make sure you pass the full path to the class."
                )
            # pydoc.locate may resolve to a module or a function; issubclass() would raise TypeError on those
            if not (isinstance(klass, type) and issubclass(klass, SearchEngine)):
                raise ValueError(f"Class {search_engine_provider} is not a subclass of SearchEngine.")
            self.search_engine = klass(api_key=api_key, top_k=top_k, engine=engine, search_engine_kwargs=search_engine_kwargs)  # type: ignore
        elif isinstance(search_engine_provider, SearchEngine):
            self.search_engine = search_engine_provider
        else:
            raise ValueError(
                "search_engine_provider must be either a string (SearchEngine class name) or a SearchEngine instance."
            )

    def run(
        self,
        query: Optional[str] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[MultiLabel] = None,
        documents: Optional[List[Document]] = None,
        meta: Optional[dict] = None,
    ) -> Tuple[Dict, str]:
        """
        Search the search engine for the given query and return the results. Only the query parameter is used.
        :param query: The query to search for.

        :return: A tuple of the output dictionary, holding the search results under the "documents" key, and the
        name of the outgoing edge ("output_1").
        :raises ValueError: If `query` is None or empty.
        """
        # query is a required parameter for search, we need to keep the signature of run() the same as in other nodes
        if not query:
            raise ValueError("WebSearch run requires the `query` parameter")
        return {"documents": self.search_engine.search(query)}, "output_1"

    def run_batch(
        self,
        queries: Optional[Union[str, List[str]]] = None,
        file_paths: Optional[List[str]] = None,
        labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None,
        documents: Optional[Union[List[Document], List[List[Document]]]] = None,
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
        params: Optional[dict] = None,
        debug: Optional[bool] = None,
    ):
        """
        Search the search engine once per query and collect the results. Only the queries parameter is used.
        :param queries: A single query string or a list of query strings.

        :return: A tuple of the output dictionary, holding one search result per query (in query order) under the
        "documents" key, and the name of the outgoing edge ("output_1").
        :raises ValueError: If `queries` is neither a string nor a list.
        """
        if isinstance(queries, str):
            # a single query is handled as a batch of one
            queries = [queries]
        elif not isinstance(queries, list):
            raise ValueError("WebSearch run_batch requires the `queries` parameter to be Union[str, List[str]]")
        results = [self.search_engine.search(query) for query in queries]
        return {"documents": results}, "output_1"

__init__ ¶

__init__(api_key: str, top_k: Optional[int] = 10, search_engine_provider: Union[str, SearchEngine] = 'SerpAPI', engine: Optional[str] = 'google', search_engine_kwargs: Optional[Dict[str, Any]] = None)

Parameters:

Name	Type	Description	Default
`api_key`	`str`	API key for the search engine provider.	required
`search_engine_provider`	`Union[str, SearchEngine]`	Name of the search engine provider class, see `providers.py` for a list of supported providers.	`'SerpAPI'`
`search_engine_kwargs`	`Optional[Dict[str, Any]]`	Additional parameters to pass to the search engine provider.	`None`

Source code in pipelines/pipelines/nodes/search_engine/web.py

def __init__(
    self,
    api_key: str,
    top_k: Optional[int] = 10,
    search_engine_provider: Union[str, SearchEngine] = "SerpAPI",
    engine: Optional[str] = "google",
    search_engine_kwargs: Optional[Dict[str, Any]] = None,
):
    """
    :param api_key: API key for the search engine provider.
    :param top_k: Number of results to return. Forwarded to the provider class when the provider is given by name.
    :param search_engine_provider: Name of the search engine provider class, see `providers.py` for a list of
    supported providers. Either a bare class name from `providers.py`, a fully qualified dotted path to a
    SearchEngine subclass, or an already constructed SearchEngine instance. When an instance is passed it is used
    as is and `api_key`, `top_k`, `engine` and `search_engine_kwargs` are ignored.
    :param engine: Search engine to use, for example google. Forwarded to the provider class when the provider is
    given by name.
    :param search_engine_kwargs: Additional parameters to pass to the search engine provider.
    :raises ValueError: If the provider cannot be located, is not a SearchEngine subclass, or is neither a string
    nor a SearchEngine instance.
    """
    super().__init__()
    if isinstance(search_engine_provider, str):
        # try to find the provider class: first as a name inside providers.py, then as a full dotted path
        search_path = [f"pipelines.nodes.search_engine.providers.{search_engine_provider}", search_engine_provider]
        # Skip candidates that do not resolve. A plain next() over the locate() results would stop at the
        # first candidate even when it is None, so the full-path fallback would never be tried.
        klass: Optional[Type[SearchEngine]] = next(  # type: ignore
            (located for located in map(pydoc.locate, search_path) if located is not None), None
        )

        if klass is None:
            raise ValueError(
                f"Could not locate the SearchEngine class with the name {search_engine_provider}. "
                f"Make sure you pass the full path to the class."
            )
        # pydoc.locate may resolve to a module or a function; issubclass() would raise TypeError on those
        if not (isinstance(klass, type) and issubclass(klass, SearchEngine)):
            raise ValueError(f"Class {search_engine_provider} is not a subclass of SearchEngine.")
        self.search_engine = klass(api_key=api_key, top_k=top_k, engine=engine, search_engine_kwargs=search_engine_kwargs)  # type: ignore
    elif isinstance(search_engine_provider, SearchEngine):
        self.search_engine = search_engine_provider
    else:
        raise ValueError(
            "search_engine_provider must be either a string (SearchEngine class name) or a SearchEngine instance."
        )

run ¶

run(query: Optional[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[MultiLabel] = None, documents: Optional[List[Document]] = None, meta: Optional[dict] = None) -> Tuple[Dict, str]

Search the search engine for the given query and return the results. Only the query parameter is used.

Parameters:

Name	Type	Description	Default
`query`	`Optional[str]`	The query to search for.	`None`

Returns:

Type	Description
`Tuple[Dict, str]`	List of search results as documents.

Source code in pipelines/pipelines/nodes/search_engine/web.py

def run(
    self,
    query: Optional[str] = None,
    file_paths: Optional[List[str]] = None,
    labels: Optional[MultiLabel] = None,
    documents: Optional[List[Document]] = None,
    meta: Optional[dict] = None,
) -> Tuple[Dict, str]:
    """
    Run a web search for the query and wrap the hits in the node output format. Of all the parameters, only
    `query` is read.
    :param query: The query to search for. Must be non-empty.

    :return: A tuple of the output dictionary, holding the search results under the "documents" key, and the
    name of the outgoing edge ("output_1").
    """
    # The remaining parameters exist only so that run() keeps the same signature as in other nodes.
    if query:
        hits = self.search_engine.search(query)
        return {"documents": hits}, "output_1"
    raise ValueError("WebSearch run requires the `query` parameter")

Search Engine Module¶

pipelines.pipelines.nodes.search_engine.providers ¶

SearchApi ¶

__init__ ¶

search ¶

SerpAPI ¶

__init__ ¶

search ¶

SerperDev ¶

__init__ ¶

search ¶

pipelines.pipelines.nodes.search_engine.web ¶

WebSearch ¶

__init__ ¶

run ¶

init ¶

init ¶

init ¶

init ¶