Răsfoiți Sursa

Merge pull request #13528 from Classic298/dev

feat: Enhance YouTube Transcription Loader for multi-language support
Tim Jaeryang Baek 5 luni în urmă
părinte
comite
ea07e242f5
1 fișier modificat, cu 34 adăugiri și 19 ștergeri
  1. 34 19
      backend/open_webui/retrieval/loaders/youtube.py

+ 34 - 19
backend/open_webui/retrieval/loaders/youtube.py

@@ -62,12 +62,17 @@ class YoutubeLoader:
         _video_id = _parse_video_id(video_id)
         self.video_id = _video_id if _video_id is not None else video_id
         self._metadata = {"source": video_id}
-        self.language = language
         self.proxy_url = proxy_url
+        
+        # Ensure language is a list
         if isinstance(language, str):
             self.language = [language]
         else:
-            self.language = language
+            self.language = list(language)
+        
+        # Add English as fallback if not already in the list
+        if "en" not in self.language:
+            self.language.append("en")
 
     def load(self) -> List[Document]:
         """Load YouTube transcripts into `Document` objects."""
@@ -82,7 +87,7 @@ class YoutubeLoader:
                 'Could not import "youtube_transcript_api" Python package. '
                 "Please install it with `pip install youtube-transcript-api`."
             )
-
+    
         if self.proxy_url:
             youtube_proxies = {
                 "http": self.proxy_url,
@@ -92,7 +97,7 @@ class YoutubeLoader:
             log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
         else:
             youtube_proxies = None
-
+        
         try:
             transcript_list = YouTubeTranscriptApi.list_transcripts(
                 self.video_id, proxies=youtube_proxies
@@ -100,18 +105,28 @@ class YoutubeLoader:
         except Exception as e:
             log.exception("Loading YouTube transcript failed")
             return []
-
-        try:
-            transcript = transcript_list.find_transcript(self.language)
-        except NoTranscriptFound:
-            transcript = transcript_list.find_transcript(["en"])
-
-        transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
-
-        transcript = " ".join(
-            map(
-                lambda transcript_piece: transcript_piece.text.strip(" "),
-                transcript_pieces,
-            )
-        )
-        return [Document(page_content=transcript, metadata=self._metadata)]
+        
+        # Try each language in order of priority
+        for lang in self.language:
+            try:
+                transcript = transcript_list.find_transcript([lang])
+                log.debug(f"Found transcript for language '{lang}'")
+                transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
+                transcript_text = " ".join(
+                    map(
+                        lambda transcript_piece: transcript_piece.text.strip(" "),
+                        transcript_pieces,
+                    )
+                )
+                return [Document(page_content=transcript_text, metadata=self._metadata)]
+            except NoTranscriptFound:
+                log.debug(f"No transcript found for language '{lang}'")
+                continue
+            except Exception as e:
+                log.info(f"Error finding transcript for language '{lang}'")
+                raise e
+    
+        # If we get here, all languages failed
+        languages_tried = ", ".join(self.language)
+        log.warning(f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed.")
+        raise NoTranscriptFound(f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed.")