google_pse.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. import logging
  2. from typing import Optional
  3. import requests
  4. from open_webui.retrieval.web.main import SearchResult, get_filtered_results
  5. from open_webui.env import SRC_LOG_LEVELS
# Module-level logger; its level is taken from the "RAG" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
  8. def search_google_pse(
  9. api_key: str,
  10. search_engine_id: str,
  11. query: str,
  12. count: int,
  13. filter_list: Optional[list[str]] = None,
  14. referer: Optional[str] = None,
  15. ) -> list[SearchResult]:
  16. """Search using Google's Programmable Search Engine API and return the results as a list of SearchResult objects.
  17. Handles pagination for counts greater than 10.
  18. Args:
  19. api_key (str): A Programmable Search Engine API key
  20. search_engine_id (str): A Programmable Search Engine ID
  21. query (str): The query to search for
  22. count (int): The number of results to return (max 100, as PSE max results per query is 10 and max page is 10)
  23. filter_list (Optional[list[str]], optional): A list of keywords to filter out from results. Defaults to None.
  24. Returns:
  25. list[SearchResult]: A list of SearchResult objects.
  26. """
  27. url = "https://www.googleapis.com/customsearch/v1"
  28. headers = {"Content-Type": "application/json"}
  29. if referer:
  30. headers["Referer"] = referer
  31. all_results = []
  32. start_index = 1 # Google PSE start parameter is 1-based
  33. while count > 0:
  34. num_results_this_page = min(count, 10) # Google PSE max results per page is 10
  35. params = {
  36. "cx": search_engine_id,
  37. "q": query,
  38. "key": api_key,
  39. "num": num_results_this_page,
  40. "start": start_index,
  41. }
  42. response = requests.request("GET", url, headers=headers, params=params)
  43. response.raise_for_status()
  44. json_response = response.json()
  45. results = json_response.get("items", [])
  46. if results: # check if results are returned. If not, no more pages to fetch.
  47. all_results.extend(results)
  48. count -= len(
  49. results
  50. ) # Decrement count by the number of results fetched in this page.
  51. start_index += 10 # Increment start index for the next page
  52. else:
  53. break # No more results from Google PSE, break the loop
  54. if filter_list:
  55. all_results = get_filtered_results(all_results, filter_list)
  56. return [
  57. SearchResult(
  58. link=result["link"],
  59. title=result.get("title"),
  60. snippet=result.get("snippet"),
  61. )
  62. for result in all_results
  63. ]