1
0

utils.py 776 B

12345678910111213141516171819202122232425262728
from datetime import datetime
from typing import Any
  2. KEYS_TO_EXCLUDE = ["content", "pages", "tables", "paragraphs", "sections", "figures"]
  3. def filter_metadata(metadata: dict[str, any]) -> dict[str, any]:
  4. metadata = {
  5. key: value for key, value in metadata.items() if key not in KEYS_TO_EXCLUDE
  6. }
  7. return metadata
  8. def process_metadata(
  9. metadata: dict[str, any],
  10. ) -> dict[str, any]:
  11. for key, value in metadata.items():
  12. # Remove large fields
  13. if key in KEYS_TO_EXCLUDE:
  14. del metadata[key]
  15. # Convert non-serializable fields to strings
  16. if (
  17. isinstance(value, datetime)
  18. or isinstance(value, list)
  19. or isinstance(value, dict)
  20. ):
  21. metadata[key] = str(value)
  22. return metadata