12345678910111213141516171819202122232425262728 |
from collections.abc import Iterable
from datetime import datetime
from typing import Any, Optional
# Metadata keys that filter_metadata() and process_metadata() drop
# (the "large fields" of a metadata mapping).
KEYS_TO_EXCLUDE = [
    "content",
    "pages",
    "tables",
    "paragraphs",
    "sections",
    "figures",
]
def filter_metadata(
    metadata: dict[str, Any],
    *,
    keys_to_exclude: Optional[Iterable[str]] = None,
) -> dict[str, Any]:
    """Return a copy of *metadata* without the excluded keys.

    The input mapping is left untouched; a new dict is always returned.

    Args:
        metadata: Mapping to filter.
        keys_to_exclude: Keys to drop. When ``None`` (the default), the
            module-level ``KEYS_TO_EXCLUDE`` list is used.

    Returns:
        A new dict holding every item of *metadata* whose key is not excluded.
    """
    # A frozenset gives O(1) membership tests and accepts any iterable of keys.
    excluded = frozenset(
        KEYS_TO_EXCLUDE if keys_to_exclude is None else keys_to_exclude
    )
    return {key: value for key, value in metadata.items() if key not in excluded}
def process_metadata(
    metadata: dict[str, Any],
    *,
    keys_to_exclude: Optional[Iterable[str]] = None,
) -> dict[str, Any]:
    """Strip excluded keys from *metadata* and stringify container/date values.

    The mapping is modified in place and the same object is returned.

    Args:
        metadata: Mapping to clean up; it is mutated.
        keys_to_exclude: Keys to remove. When ``None`` (the default), the
            module-level ``KEYS_TO_EXCLUDE`` list is used.

    Returns:
        The same *metadata* dict, with excluded keys removed and every
        remaining ``datetime``, ``list`` or ``dict`` value replaced by
        ``str(value)``.
    """
    excluded = frozenset(
        KEYS_TO_EXCLUDE if keys_to_exclude is None else keys_to_exclude
    )
    # Iterate over a snapshot: deleting from a dict while iterating its live
    # items() view raises "RuntimeError: dictionary changed size during
    # iteration".
    for key, value in list(metadata.items()):
        if key in excluded:
            del metadata[key]
            # Skip the conversion below, otherwise a removed list/dict value
            # would be written straight back under the same key.
            continue
        # Convert non-serializable fields to strings
        if isinstance(value, (datetime, list, dict)):
            metadata[key] = str(value)
    return metadata
|