# test_mistral_tokenizer.py
import os

from transformers import AutoProcessor, AutoTokenizer
  2. def test_tokenizer(name, tokenizer, verbose=False):
  3. print(f"--- {name} ({tokenizer.__class__.__name__}) ---")
  4. text = "Hello! How can I assist you today? Let me know if you need help with something or just want to chat."
  5. encoded = tokenizer.encode(text)
  6. decoded = tokenizer.decode(encoded)
  7. print(f"{encoded=}")
  8. print(f"{decoded=}")
  9. reconstructed = ""
  10. for token in encoded:
  11. if verbose:
  12. print(f"{token=}")
  13. print(f"{tokenizer.decode([token])=}")
  14. reconstructed += tokenizer.decode([token])
  15. print(f"{reconstructed=}")
  16. strip_tokens = lambda s: s.lstrip(tokenizer.decode([tokenizer.bos_token_id])).rstrip(tokenizer.decode([tokenizer.eos_token_id]))
  17. assert text == strip_tokens(decoded) == strip_tokens(reconstructed)
  18. # test_tokenizer(AutoTokenizer.from_pretrained("mlx-community/Mistral-Nemo-Instruct-2407-4bit"))
  19. models = [
  20. "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
  21. "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated",
  22. "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit",
  23. "NousResearch/Meta-Llama-3.1-70B",
  24. "mlx-community/Meta-Llama-3.1-405B-4bit",
  25. "mlx-community/Meta-Llama-3-8B-Instruct-4bit",
  26. "mlx-community/Meta-Llama-3-70B-Instruct-4bit",
  27. # "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx",
  28. # "llava-hf/llava-1.5-7b-hf",
  29. "mlx-community/Mistral-Nemo-Instruct-2407-4bit",
  30. "mlx-community/Mistral-Large-Instruct-2407-4bit",
  31. ]
  32. import os
  33. verbose = os.environ.get("VERBOSE", "0").lower() == "1"
  34. for m in models:
  35. # TODO: figure out why use_fast=False is giving inconsistent behaviour (no spaces decoding invididual tokens) for Mistral-Large-Instruct-2407-4bit
  36. # test_tokenizer(m, AutoProcessor.from_pretrained(m, use_fast=False), verbose)
  37. test_tokenizer(m, AutoProcessor.from_pretrained(m, use_fast=True), verbose)
  38. test_tokenizer(m, AutoTokenizer.from_pretrained(m), verbose)