1
0
Classic298 5 сар өмнө
parent
commit
0a845db8ec

+ 60 - 60
backend/open_webui/retrieval/loaders/youtube.py

@@ -70,67 +70,67 @@ class YoutubeLoader:
             self.language = language
 
     def load(self) -> List[Document]:
-    """Load YouTube transcripts into `Document` objects."""
-    try:
-        from youtube_transcript_api import (
-            NoTranscriptFound,
-            TranscriptsDisabled,
-            YouTubeTranscriptApi,
-        )
-    except ImportError:
-        raise ImportError(
-            'Could not import "youtube_transcript_api" Python package. '
-            "Please install it with `pip install youtube-transcript-api`."
-        )
-
-    if self.proxy_url:
-        youtube_proxies = {
-            "http": self.proxy_url,
-            "https": self.proxy_url,
-        }
-        # Don't log complete URL because it might contain secrets
-        log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
-    else:
-        youtube_proxies = None
-
-    try:
-        transcript_list = YouTubeTranscriptApi.list_transcripts(
-            self.video_id, proxies=youtube_proxies
-        )
-    except Exception as e:
-        log.exception("Loading YouTube transcript failed")
-        return []
-
-    # Try each language in order of priority
-    last_exception = None
-    for lang in self.language:
+        """Load YouTube transcripts into `Document` objects."""
         try:
-            log.debug(f"Attempting to find transcript for language '{lang}'")
-            transcript = transcript_list.find_transcript([lang])
-            log.info(f"Found transcript for language '{lang}'")
-            
-            transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
-            transcript_text = " ".join(
-                map(
-                    lambda transcript_piece: transcript_piece.text.strip(" "),
-                    transcript_pieces,
-                )
+            from youtube_transcript_api import (
+                NoTranscriptFound,
+                TranscriptsDisabled,
+                YouTubeTranscriptApi,
+            )
+        except ImportError:
+            raise ImportError(
+                'Could not import "youtube_transcript_api" Python package. '
+                "Please install it with `pip install youtube-transcript-api`."
+            )
+    
+        if self.proxy_url:
+            youtube_proxies = {
+                "http": self.proxy_url,
+                "https": self.proxy_url,
+            }
+            # Don't log complete URL because it might contain secrets
+            log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
+        else:
+            youtube_proxies = None
+    
+        try:
+            transcript_list = YouTubeTranscriptApi.list_transcripts(
+                self.video_id, proxies=youtube_proxies
             )
-            return [Document(page_content=transcript_text, metadata=self._metadata)]
-        except NoTranscriptFound as e:
-            log.debug(f"No transcript found for language '{lang}'")
-            last_exception = e
-            continue
         except Exception as e:
-            # If we hit any other type of exception, log it and re-raise
-            log.exception(f"Error finding transcript for language '{lang}'")
-            raise e
-
-    # If all specified languages fail, raise the last exception
-    # This maintains compatibility with the error handling in the rest of the application
-    if last_exception:
-        log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
-        raise last_exception
+            log.exception("Loading YouTube transcript failed")
+            return []
+    
+        # Try each language in order of priority
+        last_exception = None
+        for lang in self.language:
+            try:
+                log.debug(f"Attempting to find transcript for language '{lang}'")
+                transcript = transcript_list.find_transcript([lang])
+                log.info(f"Found transcript for language '{lang}'")
+                
+                transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
+                transcript_text = " ".join(
+                    map(
+                        lambda transcript_piece: transcript_piece.text.strip(" "),
+                        transcript_pieces,
+                    )
+                )
+                return [Document(page_content=transcript_text, metadata=self._metadata)]
+            except NoTranscriptFound as e:
+                log.debug(f"No transcript found for language '{lang}'")
+                last_exception = e
+                continue
+            except Exception as e:
+                # If we hit any other type of exception, log it and re-raise
+                log.exception(f"Error finding transcript for language '{lang}'")
+                raise e
     
-    # This should never happen (we'd have raised an exception above)
-    return []
+        # If all specified languages fail, raise the last exception
+        # This maintains compatibility with the error handling in the rest of the application
+        if last_exception:
+            log.warning(f"No transcript found for any of the specified languages: {', '.join(self.language)}")
+            raise last_exception
+        
+        # This should never happen (we'd have raised an exception above)
+        return []