|
@@ -29,6 +29,7 @@ import tiktoken
|
|
|
|
|
|
|
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
|
|
|
+from langchain_text_splitters import MarkdownHeaderTextSplitter
|
|
|
from langchain_core.documents import Document
|
|
|
|
|
|
from open_webui.models.files import FileModel, Files
|
|
@@ -1146,6 +1147,7 @@ def save_docs_to_vector_db(
|
|
|
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
|
|
|
add_start_index=True,
|
|
|
)
|
|
|
+ docs = text_splitter.split_documents(docs)
|
|
|
elif request.app.state.config.TEXT_SPLITTER == "token":
|
|
|
log.info(
|
|
|
f"Using token text splitter: {request.app.state.config.TIKTOKEN_ENCODING_NAME}"
|
|
@@ -1158,11 +1160,52 @@ def save_docs_to_vector_db(
|
|
|
chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
|
|
|
add_start_index=True,
|
|
|
)
|
|
|
+ docs = text_splitter.split_documents(docs)
|
|
|
+ elif request.app.state.config.TEXT_SPLITTER == "markdown_header":
|
|
|
+ log.info("Using markdown header text splitter")
|
|
|
+
|
|
|
+ # Define headers to split on: all six markdown header levels (# through ######)
|
|
|
+ headers_to_split_on = [
|
|
|
+ ("#", "Header 1"),
|
|
|
+ ("##", "Header 2"),
|
|
|
+ ("###", "Header 3"),
|
|
|
+ ("####", "Header 4"),
|
|
|
+ ("#####", "Header 5"),
|
|
|
+ ("######", "Header 6"),
|
|
|
+ ]
|
|
|
+
|
|
|
+ markdown_splitter = MarkdownHeaderTextSplitter(
|
|
|
+ headers_to_split_on=headers_to_split_on,
|
|
|
+ strip_headers=False, # Keep headers in content for context
|
|
|
+ )
|
|
|
+
|
|
|
+ md_split_docs = []
|
|
|
+ for doc in docs:
|
|
|
+ md_header_splits = markdown_splitter.split_text(doc.page_content)
|
|
|
+ text_splitter = RecursiveCharacterTextSplitter(
|
|
|
+ chunk_size=request.app.state.config.CHUNK_SIZE,
|
|
|
+ chunk_overlap=request.app.state.config.CHUNK_OVERLAP,
|
|
|
+ add_start_index=True,
|
|
|
+ )
|
|
|
+ md_header_splits = text_splitter.split_documents(md_header_splits)
|
|
|
+
|
|
|
+ # Rebuild each chunk as a Document carrying the source document's metadata plus a "headings" list
|
|
|
+ for split_chunk in md_header_splits:
|
|
|
+ headings_list = []
|
|
|
+ # Extract header values in order based on headers_to_split_on
|
|
|
+ for _, header_meta_key_name in headers_to_split_on:
|
|
|
+ if header_meta_key_name in split_chunk.metadata:
|
|
|
+ headings_list.append(split_chunk.metadata[header_meta_key_name])
|
|
|
+
|
|
|
+ md_split_docs.append(Document(
|
|
|
+ page_content=split_chunk.page_content,
|
|
|
+ metadata={**doc.metadata, "headings": headings_list}
|
|
|
+ ))
|
|
|
+
|
|
|
+ docs = md_split_docs
|
|
|
else:
|
|
|
raise ValueError(ERROR_MESSAGES.DEFAULT("Invalid text splitter"))
|
|
|
|
|
|
- docs = text_splitter.split_documents(docs)
|
|
|
-
|
|
|
if len(docs) == 0:
|
|
|
raise ValueError(ERROR_MESSAGES.EMPTY_CONTENT)
|
|
|
|