Browse Source

Fixes #14752 and prefers manually created transcripts over auto-generated ones (falling back to the generated transcript when no manual one exists)

lucy 4 months ago
parent
commit
b0965a8184
1 changed file with 28 additions and 6 deletions
  1. 28 6
      backend/open_webui/retrieval/loaders/youtube.py

+ 28 - 6
backend/open_webui/retrieval/loaders/youtube.py

@@ -1,4 +1,5 @@
 import logging
+from xml.etree.ElementTree import ParseError
 
 from typing import Any, Dict, Generator, List, Optional, Sequence, Union
 from urllib.parse import parse_qs, urlparse
@@ -93,7 +94,6 @@ class YoutubeLoader:
                 "http": self.proxy_url,
                 "https": self.proxy_url,
             }
-            # Don't log complete URL because it might contain secrets
             log.debug(f"Using proxy URL: {self.proxy_url[:14]}...")
         else:
             youtube_proxies = None
@@ -110,11 +110,35 @@ class YoutubeLoader:
         for lang in self.language:
             try:
                 transcript = transcript_list.find_transcript([lang])
+                if transcript.is_generated:
+                    log.debug(f"Found generated transcript for language '{lang}'")
+                    try:
+                        transcript = transcript_list.find_manually_created_transcript(
+                            [lang]
+                        )
+                        log.debug(f"Found manual transcript for language '{lang}'")
+                    except NoTranscriptFound:
+                        log.debug(
+                            f"No manual transcript found for language '{lang}', using generated"
+                        )
+                        pass
+
                 log.debug(f"Found transcript for language '{lang}'")
-                transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
+                try:
+                    transcript_pieces: List[Dict[str, Any]] = transcript.fetch()
+                except ParseError:
+                    log.debug(f"Empty or invalid transcript for language '{lang}'")
+                    continue
+
+                if not transcript_pieces:
+                    log.debug(f"Empty transcript for language '{lang}'")
+                    continue
+
                 transcript_text = " ".join(
                     map(
-                        lambda transcript_piece: transcript_piece.text.strip(" "),
+                        lambda transcript_piece: transcript_piece.text.strip(" ")
+                        if hasattr(transcript_piece, "text")
+                        else "",
                         transcript_pieces,
                     )
                 )
@@ -131,6 +155,4 @@ class YoutubeLoader:
         log.warning(
             f"No transcript found for any of the specified languages: {languages_tried}. Verify if the video has transcripts, add more languages if needed."
         )
-        raise NoTranscriptFound(
-            f"No transcript found for any supported language. Verify if the video has transcripts, add more languages if needed."
-        )
+        raise NoTranscriptFound(self.video_id, self.language, list(transcript_list))