test_tokenizers.py

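"""Round-trip smoke test for the tokenizer of every model card in exo.

For each (non-ignored) repo id this script encodes a fixed sentence, decodes it
back both in a single call and token by token, and asserts that the original
text survives the round trip once BOS/EOS markers are stripped.
"""
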
import os
import re

from transformers import AutoTokenizer, AutoProcessor

from exo.models import model_cards

def test_tokenizer(name, tokenizer, verbose=False):
  print(f"--- {name} ({tokenizer.__class__.__name__}) ---")
  text = "Hello! How can I assist you today? Let me know if you need help with something or just want to chat."
  encoded = tokenizer.encode(text)
  decoded = tokenizer.decode(encoded)
  print(f"{encoded=}")
  print(f"{decoded=}")

  # Decode token by token and concatenate; this should reproduce the single-call decode.
  reconstructed = ""
  for token in encoded:
    if verbose:
      print(f"{token=}")
      print(f"{tokenizer.decode([token])=}")
    reconstructed += tokenizer.decode([token])
  print(f"{reconstructed=}")

  # Strip the BOS/EOS marker strings (if any) so the round trip matches the raw input.
  # removeprefix/removesuffix avoid the str.lstrip pitfall of stripping a character *set*.
  bos = tokenizer.decode([tokenizer.bos_token_id]) if tokenizer.bos_token_id is not None else ""
  eos = tokenizer.decode([tokenizer.eos_token_id]) if tokenizer.eos_token_id is not None else ""
  strip_tokens = lambda s: s.removeprefix(bos).removesuffix(eos)
  assert text == strip_tokens(decoded) == strip_tokens(reconstructed)
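
# For a tokenizer that prepends a BOS marker, the round trip looks like:
#   decoded       == '<BOS string>' + text   (single-call decode)
#   reconstructed == '<BOS string>' + text   (token-by-token decode)
# and both compare equal to `text` after strip_tokens removes the markers.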

# Repos to skip; "*" acts as a glob wildcard (expanded to ".*" below).
ignore = [
  "TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R",
  "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx",
  "mlx-community/DeepSeek-V2.5-MLX-AQ4_1_64",
  "llava-hf/llava-1.5-7b-hf",
  "mlx-community/Qwen*",
  "dummy",
  "mlx-community/Meta-Llama-3.1-405B-Instruct-8bit",
]
ignore_pattern = re.compile(r"^(" + "|".join(model.replace("*", ".*") for model in ignore) + r")")

# Collect every repo id referenced by the model cards, skipping ignored ones.
models = []
for model_id in model_cards:
  for engine_type, repo_id in model_cards[model_id].get("repo", {}).items():
    if not ignore_pattern.match(repo_id):
      models.append(repo_id)
models = list(set(models))
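
# At this point `models` holds deduplicated repo ids, e.g. (hypothetical sample):
#   ['mlx-community/Meta-Llama-3.1-8B-Instruct-4bit', 'mlx-community/Mistral-Nemo-Instruct-2407-4bit', ...]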
verbose = os.environ.get("VERBOSE", "0").lower() == "1"
for m in models:
  # TODO: figure out why use_fast=False gives inconsistent behaviour (no spaces when decoding individual tokens) for Mistral-Large-Instruct-2407-4bit
  # test_tokenizer(m, AutoProcessor.from_pretrained(m, use_fast=False), verbose)
  test_tokenizer(m, AutoProcessor.from_pretrained(m, use_fast=True), verbose)
  test_tokenizer(m, AutoTokenizer.from_pretrained(m), verbose)
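
# Usage (assumes network access to fetch tokenizer files from Hugging Face):
#   python test_tokenizers.py
#   VERBOSE=1 python test_tokenizers.py  # also prints each token id and its decode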