test_tokenizers.py

import os

from transformers import AutoTokenizer, AutoProcessor

from exo.models import model_base_shards


def test_tokenizer(name, tokenizer, verbose=False):
  print(f"--- {name} ({tokenizer.__class__.__name__}) ---")
  text = "Hello! How can I assist you today? Let me know if you need help with something or just want to chat."
  encoded = tokenizer.encode(text)
  decoded = tokenizer.decode(encoded)

  print(f"{encoded=}")
  print(f"{decoded=}")

  # Decode token by token and check that the concatenation matches the full decode.
  reconstructed = ""
  for token in encoded:
    if verbose:
      print(f"{token=}")
      print(f"{tokenizer.decode([token])=}")
    reconstructed += tokenizer.decode([token])
  print(f"{reconstructed=}")

  # removeprefix/removesuffix rather than lstrip/rstrip: the str.*strip methods
  # treat their argument as a set of characters, so they could eat legitimate
  # leading/trailing text that shares characters with the BOS/EOS token string.
  def strip_tokens(s):
    if tokenizer.bos_token_id is not None:
      s = s.removeprefix(tokenizer.decode([tokenizer.bos_token_id]))
    if tokenizer.eos_token_id is not None:
      s = s.removesuffix(tokenizer.decode([tokenizer.eos_token_id]))
    return s

  assert text == strip_tokens(decoded) == strip_tokens(reconstructed)
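
# A minimal, self-contained sketch of the same round-trip check for a single
# Hugging Face tokenizer, outside the exo shard registry. The "gpt2" model id
# is illustrative only, not one of exo's models.
def example_single_model_roundtrip():
  tok = AutoTokenizer.from_pretrained("gpt2")
  ids = tok.encode("Hello!")
  # Decoding all ids at once should match decoding one id at a time.
  assert tok.decode(ids) == "".join(tok.decode([t]) for t in ids)
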
ignore = [
  "TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R",
  "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx",
  "llava-hf/llava-1.5-7b-hf",
]
models = [shard.model_id for shards in model_base_shards.values() for shard in shards.values() if shard.model_id not in ignore]

verbose = os.environ.get("VERBOSE", "0").lower() == "1"
for m in models:
  # TODO: figure out why use_fast=False gives inconsistent behaviour (no spaces when decoding individual tokens) for Mistral-Large-Instruct-2407-4bit.
  # test_tokenizer(m, AutoProcessor.from_pretrained(m, use_fast=False), verbose)
  test_tokenizer(m, AutoProcessor.from_pretrained(m, use_fast=True), verbose)
  test_tokenizer(m, AutoTokenizer.from_pretrained(m), verbose)
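
# Usage (assumption: run from the exo repo root so `exo.models` is importable):
#   python test_tokenizers.py
#   VERBOSE=1 python test_tokenizers.py   # also print each token and its decode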