
Merge branch 'main' into main

Alex Cheema, 6 months ago
commit 675b1e16a8
64 changed files with 1584 additions and 1402 deletions
  1. .circleci/config.yml (+35 -35)
  2. .gitignore (+1 -1)
  3. .pylintrc (+0 -472)
  4. README.md (+25 -5)
  5. configure_mlx.sh (+18 -2)
  6. docs/exo-rounded.png (BIN)
  7. examples/astra/astra/ContentView.swift (+1 -1)
  8. examples/chatgpt_api.sh (+1 -1)
  9. exo/__init__.py (+1 -1)
  10. exo/api/chatgpt_api.py (+53 -41)
  11. exo/download/hf/hf_helpers.py (+34 -6)
  12. exo/download/hf/hf_shard_download.py (+8 -6)
  13. exo/download/shard_download.py (+4 -2)
  14. exo/helpers.py (+21 -1)
  15. exo/inference/debug_inference_engine.py (+11 -12)
  16. exo/inference/dummy_inference_engine.py (+17 -38)
  17. exo/inference/inference_engine.py (+22 -3)
  18. exo/inference/mlx/models/base.py (+1 -1)
  19. exo/inference/mlx/models/deepseek_v2.py (+1 -1)
  20. exo/inference/mlx/models/gemma2.py (+118 -0)
  21. exo/inference/mlx/models/qwen2.py (+2 -1)
  22. exo/inference/mlx/sharded_inference_engine.py (+46 -21)
  23. exo/inference/mlx/sharded_model.py (+0 -86)
  24. exo/inference/mlx/sharded_utils.py (+14 -8)
  25. exo/inference/mlx/stateful_model.py (+42 -0)
  26. exo/inference/mlx/test_sharded_llama.py (+4 -4)
  27. exo/inference/mlx/test_sharded_llava.py (+2 -2)
  28. exo/inference/test_dummy_inference_engine.py (+39 -42)
  29. exo/inference/test_inference_engine.py (+14 -24)
  30. exo/inference/tinygrad/inference.py (+34 -36)
  31. exo/inference/tinygrad/models/llama.py (+61 -36)
  32. exo/inference/tinygrad/stateful_model.py (+42 -0)
  33. exo/inference/tokenizers.py (+6 -1)
  34. exo/main.py (+76 -40)
  35. exo/models.py (+125 -50)
  36. exo/networking/grpc/grpc_peer_handle.py (+11 -8)
  37. exo/networking/grpc/grpc_server.py (+3 -5)
  38. exo/networking/grpc/node_service.proto (+2 -5)
  39. exo/networking/grpc/node_service_pb2.py (+0 -0)
  40. exo/networking/manual/manual_discovery.py (+9 -7)
  41. exo/networking/manual/network_topology_config.py (+0 -1)
  42. exo/networking/peer_handle.py (+3 -2)
  43. exo/networking/tailscale/tailscale_discovery.py (+11 -11)
  44. exo/networking/tailscale/tailscale_helpers.py (+15 -26)
  45. exo/networking/tailscale/test_tailscale_discovery.py (+2 -0)
  46. exo/networking/udp/udp_discovery.py (+14 -15)
  47. exo/orchestration/node.py (+2 -2)
  48. exo/orchestration/standard_node.py (+112 -122)
  49. exo/tinychat/index.css (+129 -62)
  50. exo/tinychat/index.html (+26 -25)
  51. exo/tinychat/index.js (+109 -14)
  52. exo/tinychat/update_deps.py (+44 -41)
  53. exo/topology/device_capabilities.py (+4 -2)
  54. extra/start_openwebui.sh (+1 -1)
  55. format.py (+1 -1)
  56. lint.sh (+0 -5)
  57. pyproject.toml (+0 -7)
  58. ruff.toml (+0 -43)
  59. scripts/build_exo.py (+60 -0)
  60. scripts/compile_grpc.sh (+7 -0)
  61. setup.py (+10 -13)
  62. test/reconnect.sh (+1 -1)
  63. test/test_model_helpers.py (+121 -0)
  64. test/test_tokenizers.py (+8 -3)

+ 35 - 35
.circleci/config.yml

@@ -20,12 +20,18 @@ commands:
           command: |
             source env/bin/activate
 
+            # Set CLANG=1 for tinygrad only
+            if [ "<<parameters.inference_engine>>" = "tinygrad" ]; then
+              pip install llvmlite
+              export TOKENIZERS_PARALLELISM=true SUPPORT_BF16=0 CLANG=1
+            fi
+
             # Start first instance
-            HF_HOME="$(pwd)/.hf_cache_node1" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine <<parameters.inference_engine>> --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout 900 2>&1 | tee output1.log &
+            HF_HOME="$(pwd)/.hf_cache_node1" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine <<parameters.inference_engine>> --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout 900 --disable-tui 2>&1 | tee output1.log &
             PID1=$!
 
             # Start second instance
-            HF_HOME="$(pwd)/.hf_cache_node2" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine <<parameters.inference_engine>> --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --chatgpt-api-response-timeout 900 2>&1 | tee output2.log &
+            HF_HOME="$(pwd)/.hf_cache_node2" DEBUG_DISCOVERY=7 DEBUG=7 exo --inference-engine <<parameters.inference_engine>> --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --chatgpt-api-response-timeout 900 --disable-tui 2>&1 | tee output2.log &
             PID2=$!
 
             # Wait for discovery
@@ -48,13 +54,6 @@ commands:
             # Check processes before proceeding
             check_processes
 
-            # Special handling for dummy engine
-            if [ "<<parameters.inference_engine>>" = "dummy" ]; then
-              expected_content="This is a dummy response"
-            else
-              expected_content="Michael Jackson"
-            fi
-
             echo "Sending request to first instance..."
             response_1=$(curl -s http://localhost:8000/v1/chat/completions \
               -H "Content-Type: application/json" \
@@ -127,6 +126,7 @@ jobs:
             METAL_DEVICE_WRAPPER_TYPE=1 METAL_DEBUG_ERROR_MODE=0 METAL_XCODE=1 TEMPERATURE=0 python3 -m exo.inference.test_inference_engine
             echo "Running tokenizer tests..."
             python3 ./test/test_tokenizers.py
+            python3 ./test/test_model_helpers.py
 
   discovery_integration_test:
     macos:
@@ -149,9 +149,9 @@ jobs:
           name: Run discovery integration test
           command: |
             source env/bin/activate
-            DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 > output1.log 2>&1 &
+            DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --disable-tui > output1.log 2>&1 &
             PID1=$!
-            DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 > output2.log 2>&1 &
+            DEBUG_DISCOVERY=7 DEBUG=7 exo --node-id "node2" --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --disable-tui > output2.log 2>&1 &
             PID2=$!
             sleep 10
             kill $PID1 $PID2
@@ -223,29 +223,29 @@ jobs:
       - checkout
       - run: system_profiler SPHardwareDataType
 
-  # chatgpt_api_integration_test_tinygrad:
-  #   macos:
-  #     xcode: "16.0.0"
-  #   resource_class: m2pro.large
-  #   steps:
-  #     - checkout
-  #     - run:
-  #         name: Set up Python
-  #         command: |
-  #           brew install python@3.12
-  #           python3.12 -m venv env
-  #           source env/bin/activate
-  #     - run:
-  #         name: Install dependencies
-  #         command: |
-  #           source env/bin/activate
-  #           pip install --upgrade pip
-  #           pip install .
-  #     - run_chatgpt_api_test:
-  #         inference_engine: tinygrad
-  #         model_id: llama-3-8b
-  #         prompt: "Keep responses concise. Who was the king of pop?"
-  #         expected_output: "Michael Jackson"
+  chatgpt_api_integration_test_tinygrad:
+    macos:
+      xcode: "16.0.0"
+    resource_class: m2pro.large
+    steps:
+      - checkout
+      - run:
+          name: Set up Python
+          command: |
+            brew install python@3.12
+            python3.12 -m venv env
+            source env/bin/activate
+      - run:
+          name: Install dependencies
+          command: |
+            source env/bin/activate
+            pip install --upgrade pip
+            pip install .
+      - run_chatgpt_api_test:
+          inference_engine: tinygrad
+          model_id: llama-3.2-1b
+          prompt: "Keep responses concise. Who was the king of pop?"
+          expected_output: "Michael Jackson"
 
 workflows:
   version: 2
@@ -254,6 +254,6 @@ workflows:
       - unit_test
       - discovery_integration_test
       - chatgpt_api_integration_test_mlx
+      - chatgpt_api_integration_test_tinygrad
       - chatgpt_api_integration_test_dummy
       - test_macos_m1
-      # - chatgpt_api_integration_test_tinygrad

+ 1 - 1
.gitignore

@@ -4,6 +4,7 @@ test_weights.npz
 .exo_used_ports
 .exo_node_id
 .idea
+.DS_Store
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -15,7 +16,6 @@ __pycache__/
 
 # Distribution / packaging
 /.Python
-/build/
 /develop-eggs/
 /dist/
 /downloads/

+ 0 - 472
.pylintrc

@@ -1,472 +0,0 @@
-[MASTER]
-
-# A comma-separated list of package or module names from where C extensions may
-# be loaded. Extensions are loading into the active Python interpreter and may
-# run arbitrary code
-extension-pkg-whitelist=scipy,cereal.messaging.messaging_pyx,PyQt5,av
-
-# Add files or directories to the blacklist. They should be base names, not
-# paths.
-ignore=CVS
-
-# Add files or directories matching the regex patterns to the blacklist. The
-# regex matches against base names, not paths.
-ignore-patterns=.*node_service_pb2.*
-
-# Python code to execute, usually for sys.path manipulation such as
-# pygtk.require().
-#init-hook=
-
-# Use multiple processes to speed up Pylint.
-jobs=4
-
-# List of plugins (as comma separated values of python modules names) to load,
-# usually to register additional checkers.
-load-plugins=
-
-# Pickle collected data for later comparisons.
-persistent=yes
-
-# Specify a configuration file.
-#rcfile=
-
-# When enabled, pylint would attempt to guess common misconfiguration and emit
-# user-friendly hints instead of false-positive error messages
-suggestion-mode=yes
-
-# Allow loading of arbitrary C extensions. Extensions are imported into the
-# active Python interpreter and may run arbitrary code.
-unsafe-load-any-extension=no
-
-
-[MESSAGES CONTROL]
-
-# Only show warnings with the listed confidence levels. Leave empty to show
-# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
-confidence=
-
-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once).You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use"--disable=all --enable=classes
-# --disable=W"
-disable=C,R,W0613,W0511,W0212,W0201,W0106,W0603,W0621,W0703,W1201,W1203,E1136,W1514,E1101,W0221,W0105,E0401
-# E1101 for function binding
-# W0221 for Function class
-# W0105 for comment strings
-# E0401 for missing imports
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifier separated by comma (,) or put this option
-# multiple time (only on the command line, not in the configuration file where
-# it should appear only once). See also the "--disable" option for examples.
-enable=c-extension-no-member,use-a-generator, no-else-return
-
-
-[REPORTS]
-
-# Python expression which should return a note less than 10 (10 is the highest
-# note). You have access to the variables errors warning, statement which
-# respectively contain the number of errors / warnings messages and the total
-# number of statements analyzed. This is used by the global evaluation report
-# (RP0004).
-evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
-
-# Template used to display messages. This is a python new-style format string
-# used to format the message information. See doc for all details
-#msg-template=
-
-# Set the output format. Available formats are text, parseable, colorized, json
-# and msvs (visual studio).You can also give a reporter class, eg
-# mypackage.mymodule.MyReporterClass.
-output-format=text
-
-# Tells whether to display a full report or only the messages
-reports=no
-
-# Activate the evaluation score.
-score=yes
-
-
-[REFACTORING]
-
-# Maximum number of nested blocks for function / method body
-max-nested-blocks=5
-
-# Complete name of functions that never returns. When checking for
-# inconsistent-return-statements if a never returning function is called then
-# it will be considered as an explicit return statement and no message will be
-# printed.
-never-returning-functions=optparse.Values,sys.exit
-
-
-[LOGGING]
-
-# Logging modules to check that the string format arguments are in logging
-# function parameter format
-logging-modules=logging
-
-
-[SPELLING]
-
-# Limits count of emitted suggestions for spelling mistakes
-max-spelling-suggestions=4
-
-# Spelling dictionary name. Available dictionaries: none. To make it working
-# install python-enchant package.
-spelling-dict=
-
-# List of comma separated words that should not be checked.
-spelling-ignore-words=
-
-# A path to a file that contains private dictionary; one word per line.
-spelling-private-dict-file=
-
-# Tells whether to store unknown words to indicated private dictionary in
-# --spelling-private-dict-file option instead of raising a message.
-spelling-store-unknown-words=no
-
-
-[MISCELLANEOUS]
-
-# List of note tags to take in consideration, separated by a comma.
-notes=FIXME,
-      XXX,
-      TODO
-
-
-[SIMILARITIES]
-
-# Ignore comments when computing similarities.
-ignore-comments=yes
-
-# Ignore docstrings when computing similarities.
-ignore-docstrings=yes
-
-# Ignore imports when computing similarities.
-ignore-imports=no
-
-# Minimum lines number of a similarity.
-min-similarity-lines=4
-
-
-[TYPECHECK]
-
-# List of decorators that produce context managers, such as
-# contextlib.contextmanager. Add to this list to register other decorators that
-# produce valid context managers.
-contextmanager-decorators=contextlib.contextmanager
-
-# List of members which are set dynamically and missed by pylint inference
-# system, and so shouldn't trigger E1101 when accessed. Python regular
-# expressions are accepted.
-generated-members=capnp.* cereal.* pygame.* zmq.* setproctitle.* smbus2.* usb1.* serial.* cv2.* ft4222.* carla.*
-
-# Tells whether missing members accessed in mixin class should be ignored. A
-# mixin class is detected if its name ends with "mixin" (case insensitive).
-ignore-mixin-members=yes
-
-# This flag controls whether pylint should warn about no-member and similar
-# checks whenever an opaque object is returned when inferring. The inference
-# can return multiple potential results while evaluating a Python object, but
-# some branches might not be evaluated, which results in partial inference. In
-# that case, it might be useful to still emit no-member and other checks for
-# the rest of the inferred objects.
-ignore-on-opaque-inference=yes
-
-# List of class names for which member attributes should not be checked (useful
-# for classes with dynamically set attributes). This supports the use of
-# qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local
-
-# List of module names for which member attributes should not be checked
-# (useful for modules/projects where namespaces are manipulated during runtime
-# and thus existing member attributes cannot be deduced by static analysis. It
-# supports qualified module names, as well as Unix pattern matching.
-ignored-modules=flask setproctitle usb1 flask.ext.socketio smbus2 usb1.*
-
-# Show a hint with possible names when a member name was not found. The aspect
-# of finding the hint is based on edit distance.
-missing-member-hint=yes
-
-# The minimum edit distance a name should have in order to be considered a
-# similar match for a missing member name.
-missing-member-hint-distance=1
-
-# The total number of similar names that should be taken in consideration when
-# showing a hint for a missing member.
-missing-member-max-choices=1
-
-
-[VARIABLES]
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid to define new builtins when possible.
-additional-builtins=
-
-# Tells whether unused global variables should be treated as a violation.
-allow-global-unused-variables=yes
-
-# List of strings which can identify a callback function by name. A callback
-# name must start or end with one of those strings.
-callbacks=cb_,
-          _cb
-
-# A regular expression matching the name of dummy variables (i.e. expectedly
-# not used).
-dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
-
-# Argument names that match this expression will be ignored. Default to name
-# with leading underscore
-ignored-argument-names=_.*|^ignored_|^unused_
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# List of qualified module names which can have objects that can redefine
-# builtins.
-redefining-builtins-modules=six.moves,past.builtins,future.builtins
-
-
-[FORMAT]
-
-# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
-expected-line-ending-format=
-
-# Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=^\s*(# )?<?https?://\S+>?$
-
-# Number of spaces of indent required inside a hanging  or continued line.
-indent-after-paren=4
-
-# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
-# tab).
-indent-string='  '
-
-# Maximum number of characters on a single line.
-max-line-length=150
-
-# Maximum number of lines in a module
-max-module-lines=1000
-
-# Allow the body of a class to be on the same line as the declaration if body
-# contains single statement.
-single-line-class-stmt=no
-
-# Allow the body of an if to be on the same line as the test if there is no
-# else.
-single-line-if-stmt=no
-
-
-[BASIC]
-
-# Naming style matching correct argument names
-argument-naming-style=snake_case
-
-# Regular expression matching correct argument names. Overrides argument-
-# naming-style
-#argument-rgx=
-
-# Naming style matching correct attribute names
-attr-naming-style=snake_case
-
-# Regular expression matching correct attribute names. Overrides attr-naming-
-# style
-#attr-rgx=
-
-# Bad variable names which should always be refused, separated by a comma
-bad-names=foo,
-          bar,
-          baz,
-          toto,
-          tutu,
-          tata
-
-# Naming style matching correct class attribute names
-class-attribute-naming-style=any
-
-# Regular expression matching correct class attribute names. Overrides class-
-# attribute-naming-style
-#class-attribute-rgx=
-
-# Naming style matching correct class names
-class-naming-style=PascalCase
-
-# Regular expression matching correct class names. Overrides class-naming-style
-#class-rgx=
-
-# Naming style matching correct constant names
-const-naming-style=UPPER_CASE
-
-# Regular expression matching correct constant names. Overrides const-naming-
-# style
-#const-rgx=
-
-# Minimum line length for functions/classes that require docstrings, shorter
-# ones are exempt.
-docstring-min-length=-1
-
-# Naming style matching correct function names
-function-naming-style=snake_case
-
-# Regular expression matching correct function names. Overrides function-
-# naming-style
-#function-rgx=
-
-# Good variable names which should always be accepted, separated by a comma
-good-names=i,
-           j,
-           k,
-           ex,
-           Run,
-           _
-
-# Include a hint for the correct naming format with invalid-name
-include-naming-hint=no
-
-# Naming style matching correct inline iteration names
-inlinevar-naming-style=any
-
-# Regular expression matching correct inline iteration names. Overrides
-# inlinevar-naming-style
-#inlinevar-rgx=
-
-# Naming style matching correct method names
-method-naming-style=snake_case
-
-# Regular expression matching correct method names. Overrides method-naming-
-# style
-#method-rgx=
-
-# Naming style matching correct module names
-module-naming-style=snake_case
-
-# Regular expression matching correct module names. Overrides module-naming-
-# style
-#module-rgx=
-
-# Colon-delimited sets of names that determine each other's naming style when
-# the name regexes allow several styles.
-name-group=
-
-# Regular expression which should only match function or class names that do
-# not require a docstring.
-no-docstring-rgx=^_
-
-# List of decorators that produce properties, such as abc.abstractproperty. Add
-# to this list to register other decorators that produce valid properties.
-property-classes=abc.abstractproperty
-
-# Naming style matching correct variable names
-variable-naming-style=snake_case
-
-# Regular expression matching correct variable names. Overrides variable-
-# naming-style
-#variable-rgx=
-
-
-[DESIGN]
-
-# Maximum number of arguments for function / method
-max-args=5
-
-# Maximum number of attributes for a class (see R0902).
-max-attributes=7
-
-# Maximum number of boolean expressions in a if statement
-max-bool-expr=5
-
-# Maximum number of branch for function / method body
-max-branches=12
-
-# Maximum number of locals for function / method body
-max-locals=15
-
-# Maximum number of parents for a class (see R0901).
-max-parents=7
-
-# Maximum number of public methods for a class (see R0904).
-max-public-methods=20
-
-# Maximum number of return / yield for function / method body
-max-returns=6
-
-# Maximum number of statements in function / method body
-max-statements=50
-
-# Minimum number of public methods for a class (see R0903).
-min-public-methods=2
-
-
-[CLASSES]
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,
-                      __new__,
-                      setUp
-
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,
-                  _fields,
-                  _replace,
-                  _source,
-                  _make
-
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls
-
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=mcs
-
-
-[IMPORTS]
-
-# Allow wildcard imports from modules that define __all__.
-allow-wildcard-with-all=no
-
-# Analyse import fallback blocks. This can be used to support both Python 2 and
-# 3 compatible code, which means that the block might have code that exists
-# only in one or another interpreter, leading to false positives when analysed.
-analyse-fallback-blocks=no
-
-# Deprecated modules which should not be used, separated by a comma
-deprecated-modules=regsub,
-                   TERMIOS,
-                   Bastion,
-                   rexec
-
-# Create a graph of external dependencies in the given file (report RP0402 must
-# not be disabled)
-ext-import-graph=
-
-# Create a graph of every (i.e. internal and external) dependencies in the
-# given file (report RP0402 must not be disabled)
-import-graph=
-
-# Create a graph of internal dependencies in the given file (report RP0402 must
-# not be disabled)
-int-import-graph=
-
-# Force import order to recognize a module as part of the standard
-# compatibility libraries.
-known-standard-library=
-
-# Force import order to recognize a module as part of a third party library.
-known-third-party=enchant
-
-[STRING]
-
-# This flag controls whether the implicit-str-concat should generate a warning
-# on implicit string concatenation in sequences defined over several lines.
-check-str-concat-over-line-jumps=yes
-
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=builtins.Exception

+ 25 - 5
README.md

@@ -121,14 +121,14 @@ exo
 
 That's it! No configuration required - exo will automatically discover the other device(s).
 
-exo starts a ChatGPT-like WebUI (powered by [tinygrad tinychat](https://github.com/tinygrad/tinygrad/tree/master/examples/tinychat)) on http://localhost:8000
+exo starts a ChatGPT-like WebUI (powered by [tinygrad tinychat](https://github.com/tinygrad/tinygrad/tree/master/examples/tinychat)) on http://localhost:52415
 
-For developers, exo also starts a ChatGPT-compatible API endpoint on http://localhost:8000/v1/chat/completions. Examples with curl:
+For developers, exo also starts a ChatGPT-compatible API endpoint on http://localhost:52415/v1/chat/completions. Examples with curl:
 
 #### Llama 3.2 3B:
 
 ```sh
-curl http://localhost:8000/v1/chat/completions \
+curl http://localhost:52415/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
      "model": "llama-3.2-3b",
@@ -140,7 +140,7 @@ curl http://localhost:8000/v1/chat/completions \
 #### Llama 3.1 405B:
 
 ```sh
-curl http://localhost:8000/v1/chat/completions \
+curl http://localhost:52415/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
      "model": "llama-3.1-405b",
@@ -152,7 +152,7 @@ curl http://localhost:8000/v1/chat/completions \
 #### Llava 1.5 7B (Vision Language Model):
 
 ```sh
-curl http://localhost:8000/v1/chat/completions \
+curl http://localhost:52415/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
      "model": "llava-1.5-7b-hf",
@@ -208,6 +208,12 @@ With a custom prompt:
 exo run llama-3.2-3b --prompt "What is the meaning of exo?"
 ```
 
+### Model Storage
+
+Models by default are stored in `~/.cache/huggingface/hub`.
+
+You can set a different model storage location by setting the `HF_HOME` env var.
+
 ## Debugging
 
 Enable debug logs with the DEBUG environment variable (0-9).
@@ -222,6 +228,20 @@ For the **tinygrad** inference engine specifically, there is a separate DEBUG fl
 TINYGRAD_DEBUG=2 exo
 ```
 
+## Formatting
+
+We use [yapf](https://github.com/google/yapf) to format the code. To format the code, first install the formatting requirements:
+
+```sh
+pip3 install -e '.[formatting]'
+```
+
+Then run the formatting script:
+
+```sh
+python3 format.py ./exo
+```
+
 ## Known Issues
 
 - On some versions of MacOS/Python, certificates are not installed properly which can lead to SSL errors (e.g. SSL error with huggingface.co). To fix this, run the Install Certificates command, usually:
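
For reference, the same request as the curl example above can be issued from Python. This is a minimal sketch, not part of the commit; it assumes a local exo instance on the new default port 52415 and an OpenAI-style response body.

```python
import json
from urllib.request import Request, urlopen

# Same payload as the README's curl example for llama-3.2-3b
payload = {
  "model": "llama-3.2-3b",
  "messages": [{"role": "user", "content": "What is the meaning of exo?"}],
  "temperature": 0.7,
}
req = Request(
  "http://localhost:52415/v1/chat/completions",
  data=json.dumps(payload).encode("utf-8"),
  headers={"Content-Type": "application/json"},
)
with urlopen(req) as resp:
  body = json.load(resp)
# Assumes the standard OpenAI-style response shape
print(body["choices"][0]["message"]["content"])
```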

+ 18 - 2
configure_mlx.sh

@@ -1,2 +1,18 @@
-sudo sysctl iogpu.wired_lwm_mb=400000
-sudo sysctl iogpu.wired_limit_mb=180000
+#!/bin/bash
+
+# Get the total memory in MB
+TOTAL_MEM_MB=$(($(sysctl -n hw.memsize) / 1024 / 1024))
+
+# Set WIRED_LIMIT_MB to 80%
+WIRED_LIMIT_MB=$(($TOTAL_MEM_MB * 80 / 100))
+# Set  WIRED_LWM_MB to 70%
+WIRED_LWM_MB=$(($TOTAL_MEM_MB * 70 / 100))
+
+# Display the calculated values
+echo "Total memory: $TOTAL_MEM_MB MB"
+echo "Maximum limit (iogpu.wired_limit_mb): $WIRED_LIMIT_MB MB"
+echo "Lower bound (iogpu.wired_lwm_mb): $WIRED_LWM_MB MB"
+
+# Apply the values with sysctl
+sudo sysctl -w iogpu.wired_limit_mb=$WIRED_LIMIT_MB
+sudo sysctl -w iogpu.wired_lwm_mb=$WIRED_LWM_MB

BIN
docs/exo-rounded.png


+ 1 - 1
examples/astra/astra/ContentView.swift

@@ -148,7 +148,7 @@ struct ContentView: View {
     @State private var voiceActivityThreshold: Float = 0.40
     @State private var silenceTimeThreshold = 1.0
     @State private var debugText = ""
-    @State private var apiEndpoint = "http://192.168.212.74:8000/v1/chat/completions"
+    @State private var apiEndpoint = "http://192.168.212.74:52415/v1/chat/completions"
     @State private var audioBuffer: [Float] = []
     @State private var bufferDuration: Double = 0.5 // 0.5 seconds buffer
     @State private var isInitialTranscription = true

+ 1 - 1
examples/chatgpt_api.sh

@@ -3,7 +3,7 @@
 # This works the same in a single-node set up and in a multi-node setup.
 # You need to start exo before running this by running `python3 main.py`.
 
-API_ENDPOINT="http://${API_ENDPOINT:-$(ifconfig | grep 'inet ' | grep -v '127.0.0.1' | awk '{print $2}' | head -n 1):8000}"
+API_ENDPOINT="http://${API_ENDPOINT:-$(ifconfig | grep 'inet ' | grep -v '127.0.0.1' | awk '{print $2}' | head -n 1):52415}"
 MODEL="llama-3.1-8b"
 PROMPT="What is the meaning of exo?"
 TEMPERATURE=0.7

+ 1 - 1
exo/__init__.py

@@ -1 +1 @@
-from exo.helpers import DEBUG as DEBUG, DEBUG_DISCOVERY as DEBUG_DISCOVERY, VERSION as VERSION
+from exo.helpers import DEBUG as DEBUG, DEBUG_DISCOVERY as DEBUG_DISCOVERY, VERSION as VERSION

+ 53 - 41
exo/api/chatgpt_api.py

@@ -8,15 +8,14 @@ from typing import List, Literal, Union, Dict
 from aiohttp import web
 import aiohttp_cors
 import traceback
+import signal
 from exo import DEBUG, VERSION
 from exo.download.download_progress import RepoProgressEvent
-from exo.helpers import PrefixDict
-from exo.inference.shard import Shard
+from exo.helpers import PrefixDict, shutdown
 from exo.inference.tokenizers import resolve_tokenizer
 from exo.orchestration import Node
-from exo.models import model_base_shards
-from typing import Callable
-
+from exo.models import build_base_shard, model_cards, get_repo, pretty_name, get_supported_models
+from typing import Callable, Optional
 
 class Message:
   def __init__(self, role: str, content: Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]):
@@ -27,6 +26,7 @@ class Message:
     return {"role": self.role, "content": self.content}
 
 
+
 class ChatCompletionRequest:
   def __init__(self, model: str, messages: List[Message], temperature: float):
     self.model = model
@@ -117,19 +117,11 @@ def remap_messages(messages: List[Message]) -> List[Message]:
 def build_prompt(tokenizer, _messages: List[Message]):
   messages = remap_messages(_messages)
   prompt = tokenizer.apply_chat_template([m.to_dict() for m in messages], tokenize=False, add_generation_prompt=True)
-  image_str = None
   for message in messages:
     if not isinstance(message.content, list):
       continue
 
-    for content in message.content:
-      # note: we only support one image at a time right now. Multiple is possible. See: https://github.com/huggingface/transformers/blob/e68ec18ce224af879f22d904c7505a765fb77de3/docs/source/en/model_doc/llava.md?plain=1#L41
-      # follows the convention in https://platform.openai.com/docs/guides/vision
-      if isinstance(content, dict) and content.get("type", None) == "image":
-        image_str = content.get("image", None)
-        break
-
-  return prompt, image_str
+  return prompt
 
 
 def parse_message(data: dict):
@@ -138,9 +130,9 @@ def parse_message(data: dict):
   return Message(data["role"], data["content"])
 
 
-def parse_chat_request(data: dict):
+def parse_chat_request(data: dict, default_model: str):
   return ChatCompletionRequest(
-    data.get("model", "llama-3.1-8b"),
+    data.get("model", default_model),
     [parse_message(msg) for msg in data["messages"]],
     data.get("temperature", 0.0),
   )
@@ -152,9 +144,8 @@ class PromptSession:
     self.timestamp = timestamp
     self.prompt = prompt
 
-
 class ChatGPTAPI:
-  def __init__(self, node: Node, inference_engine_classname: str, response_timeout: int = 90, on_chat_completion_request: Callable[[str, ChatCompletionRequest, str], None] = None):
+  def __init__(self, node: Node, inference_engine_classname: str, response_timeout: int = 90, on_chat_completion_request: Callable[[str, ChatCompletionRequest, str], None] = None, default_model: Optional[str] = None):
     self.node = node
     self.inference_engine_classname = inference_engine_classname
     self.response_timeout = response_timeout
@@ -163,6 +154,8 @@ class ChatGPTAPI:
     self.prompts: PrefixDict[str, PromptSession] = PrefixDict()
     self.prev_token_lens: Dict[str, int] = {}
     self.stream_tasks: Dict[str, asyncio.Task] = {}
+    self.default_model = default_model or "llama-3.2-1b"
+
     cors = aiohttp_cors.setup(self.app)
     cors_options = aiohttp_cors.ResourceOptions(
       allow_credentials=True,
@@ -177,13 +170,24 @@ class ChatGPTAPI:
     cors.add(self.app.router.add_post("/chat/completions", self.handle_post_chat_completions), {"*": cors_options})
     cors.add(self.app.router.add_post("/v1/chat/completions", self.handle_post_chat_completions), {"*": cors_options})
     cors.add(self.app.router.add_get("/v1/download/progress", self.handle_get_download_progress), {"*": cors_options})
+    cors.add(self.app.router.add_get("/modelpool", self.handle_model_support), {"*": cors_options})
+    cors.add(self.app.router.add_get("/healthcheck", self.handle_healthcheck), {"*": cors_options})
+    cors.add(self.app.router.add_post("/quit", self.handle_quit), {"*": cors_options})
 
-    self.static_dir = Path(__file__).parent.parent / "tinychat"
-    self.app.router.add_get("/", self.handle_root)
-    self.app.router.add_static("/", self.static_dir, name="static")
+    if "__compiled__" not in globals():
+      self.static_dir = Path(__file__).parent.parent/"tinychat"
+      self.app.router.add_get("/", self.handle_root)
+      self.app.router.add_static("/", self.static_dir, name="static")
 
     self.app.middlewares.append(self.timeout_middleware)
     self.app.middlewares.append(self.log_request)
+  
+  async def handle_quit(self, request):
+    if DEBUG>=1: print("Received quit signal")
+    response = web.json_response({"detail": "Quit signal received"}, status=200)
+    await response.prepare(request)
+    await response.write_eof()
+    await shutdown(signal.SIGINT, asyncio.get_event_loop(), self.node.server)
 
   async def timeout_middleware(self, app, handler):
     async def middleware(request):
@@ -191,6 +195,7 @@ class ChatGPTAPI:
         return await asyncio.wait_for(handler(request), timeout=self.response_timeout)
       except asyncio.TimeoutError:
         return web.json_response({"detail": "Request timed out"}, status=408)
+
     return middleware
 
   async def log_request(self, app, handler):
@@ -203,14 +208,25 @@ class ChatGPTAPI:
   async def handle_root(self, request):
     return web.FileResponse(self.static_dir/"index.html")
 
+  async def handle_healthcheck(self, request):
+    return web.json_response({"status": "ok"})
+
+  async def handle_model_support(self, request):
+    return web.json_response({
+      "model pool": {
+        model_name: pretty_name.get(model_name, model_name) 
+        for model_name in get_supported_models(self.node.topology_inference_engines_pool)
+      }
+    })
+  
   async def handle_get_models(self, request):
-    return web.json_response([{"id": model_name, "object": "model", "owned_by": "exo", "ready": True } for model_name, _ in model_base_shards.items()])
+    return web.json_response([{"id": model_name, "object": "model", "owned_by": "exo", "ready": True} for model_name, _ in model_cards.items()])
 
   async def handle_post_chat_token_encode(self, request):
     data = await request.json()
-    shard = model_base_shards.get(data.get("model", "llama-3.1-8b"), {}).get(self.inference_engine_classname)
+    shard = build_base_shard(self.default_model, self.inference_engine_classname)
     messages = [parse_message(msg) for msg in data.get("messages", [])]
-    tokenizer = await resolve_tokenizer(shard.model_id)
+    tokenizer = await resolve_tokenizer(get_repo(shard.model_id, self.inference_engine_classname))
     return web.json_response({"length": len(build_prompt(tokenizer, messages)[0])})
 
   async def handle_get_download_progress(self, request):
@@ -222,29 +238,28 @@ class ChatGPTAPI:
         print(f"Unknown progress event type: {type(progress_event)}. {progress_event}")
     return web.json_response(progress_data)
 
-
   async def handle_post_chat_completions(self, request):
     data = await request.json()
     if DEBUG >= 2: print(f"Handling chat completions request from {request.remote}: {data}")
     stream = data.get("stream", False)
-    chat_request = parse_chat_request(data)
-    if chat_request.model and chat_request.model.startswith("gpt-"):  # to be compatible with ChatGPT tools, point all gpt- model requests to llama instead
-      chat_request.model = "llama-3.1-8b"
-    if not chat_request.model or chat_request.model not in model_base_shards:
-      if DEBUG >= 1: print(f"Invalid model: {chat_request.model}. Supported: {list(model_base_shards.keys())}. Defaulting to llama-3.1-8b")
-      chat_request.model = "llama-3.1-8b"
-    shard = model_base_shards[chat_request.model].get(self.inference_engine_classname, None)
+    chat_request = parse_chat_request(data, self.default_model)
+    if chat_request.model and chat_request.model.startswith("gpt-"):  # to be compatible with ChatGPT tools, point all gpt- model requests to default model
+      chat_request.model = self.default_model
+    if not chat_request.model or chat_request.model not in model_cards:
+      if DEBUG >= 1: print(f"Invalid model: {chat_request.model}. Supported: {list(model_cards.keys())}. Defaulting to {self.default_model}")
+      chat_request.model = self.default_model
+    shard = build_base_shard(chat_request.model, self.inference_engine_classname)
     if not shard:
-      supported_models = [model for model, engines in model_base_shards.items() if self.inference_engine_classname in engines]
+      supported_models = [model for model, info in model_cards.items() if self.inference_engine_classname in info.get("repo", {})]
       return web.json_response(
         {"detail": f"Unsupported model: {chat_request.model} with inference engine {self.inference_engine_classname}. Supported models for this engine: {supported_models}"},
         status=400,
       )
 
-    tokenizer = await resolve_tokenizer(shard.model_id)
+    tokenizer = await resolve_tokenizer(get_repo(shard.model_id, self.inference_engine_classname))
     if DEBUG >= 4: print(f"Resolved tokenizer: {tokenizer}")
 
-    prompt, image_str = build_prompt(tokenizer, chat_request.messages)
+    prompt = build_prompt(tokenizer, chat_request.messages)
     request_id = str(uuid.uuid4())
     if self.on_chat_completion_request:
       try:
@@ -267,13 +282,10 @@ class ChatGPTAPI:
     callback_id = f"chatgpt-api-wait-response-{request_id}"
     callback = self.node.on_token.register(callback_id)
 
-    if DEBUG >= 2: print(f"Sending prompt from ChatGPT api {request_id=} {shard=} {prompt=} {image_str=}")
+    if DEBUG >= 2: print(f"Sending prompt from ChatGPT api {request_id=} {shard=} {prompt=}")
 
     try:
-      await asyncio.wait_for(
-        asyncio.shield(asyncio.create_task(self.node.process_prompt(shard, prompt, image_str, request_id=request_id))),
-        timeout=self.response_timeout
-      )
+      await asyncio.wait_for(asyncio.shield(asyncio.create_task(self.node.process_prompt(shard, prompt, request_id=request_id))), timeout=self.response_timeout)
 
       if DEBUG >= 2: print(f"Waiting for response to finish. timeout={self.response_timeout}s")
 
@@ -356,7 +368,7 @@ class ChatGPTAPI:
       deregistered_callback = self.node.on_token.deregister(callback_id)
       if DEBUG >= 2: print(f"Deregister {callback_id=} {deregistered_callback=}")
 
-  async def run(self, host: str = "0.0.0.0", port: int = 8000):
+  async def run(self, host: str = "0.0.0.0", port: int = 52415):
     runner = web.AppRunner(self.app)
     await runner.setup()
     site = web.TCPSite(runner, host, port)
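
Not part of the diff, but a minimal sketch of how the endpoints added above can be exercised, assuming a node is already serving on the new default port 52415:

```python
import asyncio
import aiohttp

async def main():
  async with aiohttp.ClientSession() as session:
    # New healthcheck endpoint added in this diff
    async with session.get("http://localhost:52415/healthcheck") as resp:
      print(await resp.json())  # expected: {"status": "ok"}
    # New model pool endpoint: models supported by the current topology
    async with session.get("http://localhost:52415/modelpool") as resp:
      print(await resp.json())

asyncio.run(main())
```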

+ 34 - 6
exo/download/hf/hf_helpers.py

@@ -1,7 +1,11 @@
+import aiofiles.os as aios
+from typing import Union
 import asyncio
 import aiohttp
 import json
 import os
+import sys
+import shutil
 from urllib.parse import urljoin
 from typing import Callable, Optional, Coroutine, Any, Dict, List, Union, Literal
 from datetime import datetime, timedelta
@@ -9,7 +13,7 @@ from fnmatch import fnmatch
 from pathlib import Path
 from typing import Generator, Iterable, TypeVar, TypedDict
 from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
-from exo.helpers import DEBUG
+from exo.helpers import DEBUG, is_frozen
 from exo.download.download_progress import RepoProgressEvent, RepoFileProgressEvent, RepoProgressCallback, RepoFileProgressCallback
 from exo.inference.shard import Shard
 import aiofiles
@@ -17,7 +21,6 @@ from aiofiles import os as aios
 
 T = TypeVar("T")
 
-
 async def get_local_snapshot_dir(repo_id: str, revision: str = "main") -> Optional[Path]:
   refs_dir = get_repo_root(repo_id)/"refs"
   refs_file = refs_dir/revision
@@ -70,8 +73,10 @@ def _add_wildcard_to_directories(pattern: str) -> str:
     return pattern + "*"
   return pattern
 
+
 def get_hf_endpoint() -> str:
-    return os.environ.get('HF_ENDPOINT', "https://huggingface.co")
+  return os.environ.get('HF_ENDPOINT', "https://huggingface.co")
+
 
 def get_hf_home() -> Path:
   """Get the Hugging Face home directory."""
@@ -97,9 +102,22 @@ async def get_auth_headers():
 
 def get_repo_root(repo_id: str) -> Path:
   """Get the root directory for a given repo ID in the Hugging Face cache."""
-  sanitized_repo_id = repo_id.replace("/", "--")
+  sanitized_repo_id = str(repo_id).replace("/", "--")
   return get_hf_home()/"hub"/f"models--{sanitized_repo_id}"
 
+async def move_models_to_hf(seed_dir: Union[str, Path]):
+  """Move model in resources folder of app to .cache/huggingface/hub"""
+  source_dir = Path(seed_dir)
+  dest_dir = get_hf_home()/"hub"
+  await aios.makedirs(dest_dir, exist_ok=True)
+  async for path in source_dir.iterdir():
+    if path.is_dir() and path.startswith("models--"):
+      dest_path = dest_dir / path.name
+      if dest_path.exists():
+        if DEBUG>=1: print(f"skipping moving {dest_path}. File already exists")
+      else:
+        await aios.rename(str(path), str(dest_path))
+        
 
 async def fetch_file_list(session, repo_id, revision, path=""):
   api_url = f"{get_hf_endpoint()}/api/models/{repo_id}/tree/{revision}"
@@ -394,7 +412,7 @@ def extract_layer_num(tensor_name: str) -> Optional[int]:
 
 
 def get_allow_patterns(weight_map: Dict[str, str], shard: Shard) -> List[str]:
-  default_patterns = set(["*.json","*.py","tokenizer.model","*.tiktoken","*.txt"])
+  default_patterns = set(["*.json", "*.py", "tokenizer.model", "*.tiktoken", "*.txt"])
   shard_specific_patterns = set()
   if weight_map:
     for tensor_name, filename in weight_map.items():
@@ -407,6 +425,16 @@ def get_allow_patterns(weight_map: Dict[str, str], shard: Shard) -> List[str]:
     elif shard.is_last_layer():
       shard_specific_patterns.add(sorted_file_names[-1])
   else:
-    shard_specific_patterns = set("*.safetensors")
+    shard_specific_patterns = set(["*.safetensors"])
   if DEBUG >= 2: print(f"get_allow_patterns {weight_map=} {shard=} {shard_specific_patterns=}")
   return list(default_patterns | shard_specific_patterns)
+
+async def has_hf_home_read_access() -> bool:
+  hf_home = get_hf_home()
+  try: return await aios.access(hf_home, os.R_OK)
+  except OSError: return False
+
+async def has_hf_home_write_access() -> bool:
+  hf_home = get_hf_home()
+  try: return await aios.access(hf_home, os.W_OK)
+  except OSError: return False
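
A minimal sketch (not part of the commit) of the new access-check helpers; it assumes only the function names shown above and that `get_hf_home()` resolves the model storage location (honoring `HF_HOME`):

```python
import asyncio
from exo.download.hf.hf_helpers import get_hf_home, has_hf_home_read_access, has_hf_home_write_access

async def main():
  # get_hf_home() returns the Hugging Face home directory used for model storage
  print("HF home:", get_hf_home())
  print("readable:", await has_hf_home_read_access())
  print("writable:", await has_hf_home_write_access())

asyncio.run(main())
```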

+ 8 - 6
exo/download/hf/hf_shard_download.py

@@ -7,6 +7,7 @@ from exo.download.shard_download import ShardDownloader
 from exo.download.download_progress import RepoProgressEvent
 from exo.download.hf.hf_helpers import download_repo_files, RepoProgressEvent, get_weight_map, get_allow_patterns, get_repo_root
 from exo.helpers import AsyncCallbackSystem, DEBUG
+from exo.models import model_cards, get_repo
 
 
 class HFShardDownloader(ShardDownloader):
@@ -17,11 +18,12 @@ class HFShardDownloader(ShardDownloader):
     self.completed_downloads: Dict[Shard, Path] = {}
     self._on_progress = AsyncCallbackSystem[str, Tuple[Shard, RepoProgressEvent]]()
 
-  async def ensure_shard(self, shard: Shard) -> Path:
+  async def ensure_shard(self, shard: Shard, inference_engine_name: str) -> Path:
+    repo_name = get_repo(shard.model_id, inference_engine_name)
     if shard in self.completed_downloads:
       return self.completed_downloads[shard]
     if self.quick_check:
-      repo_root = get_repo_root(shard.model_id)
+      repo_root = get_repo_root(repo_name)
       snapshots_dir = repo_root/"snapshots"
       if snapshots_dir.exists():
         visible_dirs = [d for d in snapshots_dir.iterdir() if not d.name.startswith('.')]
@@ -51,7 +53,7 @@ class HFShardDownloader(ShardDownloader):
     self.active_downloads = {active_shard: task for active_shard, task in self.active_downloads.items() if active_shard.model_id != shard.model_id}
 
     # Start new download
-    download_task = asyncio.create_task(self._download_shard(shard))
+    download_task = asyncio.create_task(self._download_shard(shard, repo_name))
     self.active_downloads[shard] = download_task
     try:
       path = await download_task
@@ -63,14 +65,14 @@ class HFShardDownloader(ShardDownloader):
       if shard in self.active_downloads:
         self.active_downloads.pop(shard)
 
-  async def _download_shard(self, shard: Shard) -> Path:
+  async def _download_shard(self, shard: Shard, repo_name: str) -> Path:
     async def wrapped_progress_callback(event: RepoProgressEvent):
       self._on_progress.trigger_all(shard, event)
 
-    weight_map = await get_weight_map(shard.model_id)
+    weight_map = await get_weight_map(repo_name)
     allow_patterns = get_allow_patterns(weight_map, shard)
 
-    return await download_repo_files(repo_id=shard.model_id, progress_callback=wrapped_progress_callback, allow_patterns=allow_patterns, max_parallel_downloads=self.max_parallel_downloads)
+    return await download_repo_files(repo_name, progress_callback=wrapped_progress_callback, allow_patterns=allow_patterns, max_parallel_downloads=self.max_parallel_downloads)
 
   @property
   def on_progress(self) -> AsyncCallbackSystem[str, Tuple[Shard, RepoProgressEvent]]:

+ 4 - 2
exo/download/shard_download.py

@@ -8,7 +8,7 @@ from exo.helpers import AsyncCallbackSystem
 
 class ShardDownloader(ABC):
   @abstractmethod
-  async def ensure_shard(self, shard: Shard) -> Path:
+  async def ensure_shard(self, shard: Shard, inference_engine_name: str) -> Path:
     """
         Ensures that the shard is downloaded.
         Does not allow multiple overlapping downloads at once.
@@ -17,6 +17,7 @@ class ShardDownloader(ABC):
 
         Args:
             shard (Shard): The shard to download.
+            inference_engine_name (str): The inference engine used on the node hosting the shard
         """
     pass
 
@@ -25,8 +26,9 @@ class ShardDownloader(ABC):
   def on_progress(self) -> AsyncCallbackSystem[str, Tuple[Shard, RepoProgressEvent]]:
     pass
 
+
 class NoopShardDownloader(ShardDownloader):
-  async def ensure_shard(self, shard: Shard) -> Path:
+  async def ensure_shard(self, shard: Shard, inference_engine_name: str) -> Path:
     return Path("/tmp/noop_shard")
 
   @property

+ 21 - 1
exo/helpers.py

@@ -1,4 +1,5 @@
 import os
+import sys
 import asyncio
 from typing import Callable, TypeVar, Optional, Dict, Generic, Tuple, List
 import socket
@@ -170,7 +171,7 @@ def is_valid_uuid(val):
 
 
 def get_or_create_node_id():
-  NODE_ID_FILE = Path(tempfile.gettempdir()) / ".exo_node_id"
+  NODE_ID_FILE = Path(tempfile.gettempdir())/".exo_node_id"
   try:
     if NODE_ID_FILE.is_file():
       with open(NODE_ID_FILE, "r") as f:
@@ -234,3 +235,22 @@ def get_all_ip_addresses():
   except:
     if DEBUG >= 1: print("Failed to get all IP addresses. Defaulting to localhost.")
     return ["localhost"]
+
+
+async def shutdown(signal, loop, server):
+  """Gracefully shutdown the server and close the asyncio loop."""
+  print(f"Received exit signal {signal.name}...")
+  print("Thank you for using exo.")
+  print_yellow_exo()
+  server_tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
+  [task.cancel() for task in server_tasks]
+  print(f"Cancelling {len(server_tasks)} outstanding tasks")
+  await asyncio.gather(*server_tasks, return_exceptions=True)
+  await server.stop()
+  loop.stop()
+
+
+def is_frozen():
+  return getattr(sys, 'frozen', False) or os.path.basename(sys.executable) == "exo" \
+    or ('Contents/MacOS' in str(os.path.dirname(sys.executable))) \
+    or '__nuitka__' in globals() or getattr(sys, '__compiled__', False)
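
The call sites for `shutdown()` are not shown in this hunk; a hypothetical wiring sketch, assuming only the signature above, might look like:

```python
import asyncio
import signal
from exo.helpers import shutdown

def install_signal_handlers(loop: asyncio.AbstractEventLoop, server):
  # Route SIGINT/SIGTERM to the graceful shutdown helper added above
  for sig in (signal.SIGINT, signal.SIGTERM):
    loop.add_signal_handler(sig, lambda s=sig: asyncio.create_task(shutdown(s, loop, server)))
```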

+ 11 - 12
exo/inference/debug_inference_engine.py

@@ -13,32 +13,31 @@ async def test_inference_engine(inference_engine_1: InferenceEngine, inference_e
   _tokenizer = Tokenizer(str(Path(model_id)/"tokenizer.model"))
 
   prompt = "In a single word only, what is the last name of the president of the United States? "
-  resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
-  next_resp_full, _next_inference_state_full, _ = await inference_engine_1.infer_tensor(
+  resp_full = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32), prompt=prompt)
+  token_full = await inference_engine_1.sample(resp_full)
+
+  next_resp_full = await inference_engine_1.infer_tensor(
     "A",
     shard=Shard(model_id=model_id, start_layer=0, end_layer=31, n_layers=32),
-    input_data=resp_full,
-    inference_state=inference_state_full,
+    input_data=token_full,
   )
 
-  resp1, inference_state_1, _ = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
-  resp2, inference_state_2, _ = await inference_engine_2.infer_tensor(
+  resp1 = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32), prompt=prompt)
+  resp2 = await inference_engine_2.infer_tensor(
     "B",
     shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32),
     input_data=resp1,
-    inference_state=inference_state_1,
   )
-  resp3, inference_state_3, _ = await inference_engine_1.infer_tensor(
+  token2 = await inference_engine_2.sample(resp2)
+  resp3 = await inference_engine_1.infer_tensor(
     "B",
     shard=Shard(model_id=model_id, start_layer=0, end_layer=30, n_layers=32),
-    input_data=resp2,
-    inference_state=inference_state_2,
+    input_data=token2,
   )
-  resp4, _inference_state_4, _ = await inference_engine_2.infer_tensor(
+  resp4 = await inference_engine_2.infer_tensor(
     "B",
     shard=Shard(model_id=model_id, start_layer=31, end_layer=31, n_layers=32),
     input_data=resp3,
-    inference_state=inference_state_3,
   )
 
   print(f"{resp2=}")

+ 17 - 38
exo/inference/dummy_inference_engine.py

@@ -1,59 +1,38 @@
 from typing import Optional, Tuple, TYPE_CHECKING
 import numpy as np
+import random
+import string
 import asyncio
 import json
 from exo.inference.inference_engine import InferenceEngine
 from exo.inference.shard import Shard
+def random_string(length: int):
+  return ''.join([random.choice(string.ascii_lowercase) for i in range(length)])
+  
 
 class DummyInferenceEngine(InferenceEngine):
   def __init__(self):
     self.shard = None
     self.vocab_size = 1000
+    self.hidden_size = 256
     self.eos_token_id = 0
     self.latency_mean = 0.1
     self.latency_stddev = 0.02
 
-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
-    try:
-      await self.ensure_shard(shard)
+  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
+    return np.random.randint(1, self.vocab_size, size=(1, len(prompt.split())))
+  
+  async def sample(self, x: np.ndarray) -> np.ndarray:
+    return np.random.randint(1, self.vocab_size)
 
-      # Generate random tokens
-      output_length = np.random.randint(1, 10)
-      output = np.random.randint(1, self.vocab_size, size=(1, output_length))
+  async def decode(self, shard: Shard, tokens: np.ndarray) -> str:
+    return ' '.join([random_string(np.random.randint(1, 34)) for token in tokens])
 
-      # Simulate latency
-      await asyncio.sleep(max(0, np.random.normal(self.latency_mean, self.latency_stddev)))
-
-      # Randomly decide if finished
-      is_finished = np.random.random() < 0.2
-      if is_finished:
-        output = np.array([[self.eos_token_id]])
-
-      new_state = json.dumps({"dummy_state": "some_value"})
-
-      return output, new_state, is_finished
-    except Exception as e:
-      print(f"Error in DummyInferenceEngine.infer_prompt: {str(e)}")
-      return np.array([[self.eos_token_id]]), json.dumps({"error": str(e)}), True
-
-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
+  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray) -> np.ndarray:
     await self.ensure_shard(shard)
-    state = json.loads(inference_state or "{}")
-    start_pos = state.get("start_pos", 0)
-
-    output_length = np.random.randint(1, 10)
-    output = np.random.randint(1, self.vocab_size, size=(1, output_length))
-
-    await asyncio.sleep(max(0, np.random.normal(self.latency_mean, self.latency_stddev)))
-
-    is_finished = np.random.random() < 0.2
-    if is_finished:
-      output = np.array([[self.eos_token_id]])
-
-    start_pos += input_data.shape[1] + output_length
-    new_state = json.dumps({"start_pos": start_pos})
-
-    return output, new_state, is_finished
+    sequence_length = input_data.shape[0 if self.shard.is_first_layer() else 1]
+    output = np.random.random(size=(1, sequence_length, self.vocab_size if self.shard.is_last_layer() else self.hidden_size))
+    return output
 
   async def ensure_shard(self, shard: Shard):
     if self.shard == shard:

+ 22 - 3
exo/inference/inference_engine.py

@@ -9,13 +9,32 @@ from .shard import Shard
 
 class InferenceEngine(ABC):
   @abstractmethod
-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
+  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
+    pass
+  
+  @abstractmethod
+  async def sample(self, x: np.ndarray) -> np.ndarray:
     pass
 
   @abstractmethod
-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
+  async def decode(self, shard: Shard, tokens: np.ndarray) -> str:
     pass
 
+  @abstractmethod
+  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray) -> np.ndarray:
+    pass
+  
+  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str) -> np.ndarray:
+    tokens = await self.encode(shard, prompt)
+    x = tokens.reshape(1, -1)
+    output_data = await self.infer_tensor(request_id, shard, x)
+    return output_data 
+
+inference_engine_classes = {
+  "mlx": "MLXDynamicShardInferenceEngine",
+  "tinygrad": "TinygradDynamicShardInferenceEngine",
+  "dummy": "DummyInferenceEngine",
+}
 
 def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDownloader'):
   if DEBUG >= 2:
@@ -33,4 +52,4 @@ def get_inference_engine(inference_engine_name: str, shard_downloader: 'ShardDow
   elif inference_engine_name == "dummy":
     from exo.inference.dummy_inference_engine import DummyInferenceEngine
     return DummyInferenceEngine()
-  raise ValueError(f"Unsupported inference engine: {inference_engine_name}")
+  raise ValueError(f"Unsupported inference engine: {inference_engine_name}")
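
Not part of the diff, but a minimal sketch of the new stateless call flow (encode/infer via `infer_prompt`, then `sample` and `decode`), using the dummy engine and the `Shard` signature that appear elsewhere in this commit. The model id and layer counts are hypothetical, for illustration only.

```python
import asyncio
import numpy as np
from exo.inference.shard import Shard
from exo.inference.dummy_inference_engine import DummyInferenceEngine

async def main():
  engine = DummyInferenceEngine()
  # A single shard covering the whole (hypothetical) model
  shard = Shard(model_id="dummy", start_layer=0, end_layer=7, n_layers=8)
  logits = await engine.infer_prompt("req-1", shard, "What is the meaning of exo?")  # encode() + infer_tensor()
  token = await engine.sample(logits)                    # pick a next token from the output
  print(await engine.decode(shard, np.array([token])))   # map tokens back to (random) text

asyncio.run(main())
```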

+ 1 - 1
exo/inference/mlx/models/base.py

@@ -1,7 +1,7 @@
 from typing import Optional
 import mlx.core as mx
 import mlx.nn as nn
-from mlx_lm.models.base import KVCache
+from mlx_lm.models.cache import KVCache
 
 
 class IdentityBlock(nn.Module):

+ 1 - 1
exo/inference/mlx/models/deepseek_v2.py

@@ -4,7 +4,7 @@ from typing import Optional
 import mlx.core as mx
 import mlx.nn as nn
 
-from mlx_lm.models.base import KVCache
+from mlx_lm.models.cache import KVCache
 from mlx_lm.models.deepseek_v2 import ModelArgs, DeepseekV2DecoderLayer
 from .base import IdentityBlock
 from exo.inference.shard import Shard

+ 118 - 0
exo/inference/mlx/models/gemma2.py

@@ -0,0 +1,118 @@
+from dataclasses import dataclass, field
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from mlx_lm.models.base import create_attention_mask
+from mlx_lm.models.gemma2 import TransformerBlock, ModelArgs, RMSNorm
+
+from ...shard import Shard
+from .base import IdentityBlock
+
+
+@dataclass
+class ModelArgs(ModelArgs):
+  shard: Shard = field(default_factory=lambda: Shard("", 0, 0, 0))
+
+  def __post_init__(self):
+    if isinstance(self.shard, Shard):
+      return
+    if not isinstance(self.shard, dict):
+      raise TypeError(f"Expected shard to be a Shard instance or a dict, got {type(self.shard)} instead")
+
+    self.shard = Shard(**self.shard)
+
+
+class GemmaModel(nn.Module):
+  def __init__(self, args: ModelArgs):
+    super().__init__()
+    self.args = args
+    self.vocab_size = args.vocab_size
+    self.num_hidden_layers = args.num_hidden_layers
+    assert self.vocab_size > 0
+    if args.shard.is_first_layer() or args.shard.is_last_layer():
+      self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
+    self.layers = []
+    for i in range(self.num_hidden_layers):
+      if args.shard.start_layer <= i <= args.shard.end_layer:
+        self.layers.append(TransformerBlock(args=args))
+      else:
+        self.layers.append(IdentityBlock())
+    if args.shard.is_last_layer():
+      self.norm = RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
+
+  def __call__(
+    self,
+    inputs: mx.array,
+    cache=None,
+  ):
+    if self.args.shard.is_first_layer():
+      h = self.embed_tokens(inputs)
+      h = h * (self.args.hidden_size**0.5)
+    else:
+      h = inputs
+
+    mask = None
+    if h.ndim > 1 and h.shape[1] > 1:
+      mask = create_attention_mask(h, cache)
+
+    if cache is None:
+      cache = [None]*len(self.layers)
+
+    for layer, c in zip(self.layers, cache):
+      h = layer(h, mask, cache=c)
+
+    if self.args.shard.is_last_layer():
+      h = self.norm(h)
+    return h
+
+
+class Model(nn.Module):
+  def __init__(self, args: ModelArgs):
+    super().__init__()
+    self.args = args
+    self.model_type = args.model_type
+    self.model = GemmaModel(args)
+    if args.shard.is_last_layer():
+      self.final_logit_softcapping = args.final_logit_softcapping
+
+  def __call__(
+    self,
+    inputs: mx.array,
+    cache=None,
+  ):
+    out = self.model(inputs, cache)
+    if self.args.shard.is_last_layer():
+      out = self.model.embed_tokens.as_linear(out)
+      out = mx.tanh(out / self.final_logit_softcapping)
+      out = out * self.final_logit_softcapping
+    return out
+
+  def sanitize(self, weights):
+    shard_state_dict = {}
+
+    for key, value in weights.items():
+      if "self_attn.rotary_emb.inv_freq" in key:
+        continue
+      if key.startswith('model.layers.'):
+        layer_num = int(key.split('.')[2])
+        if self.args.shard.start_layer <= layer_num <= self.args.shard.end_layer:
+          shard_state_dict[key] = value
+      elif (self.args.shard.is_first_layer() or self.args.shard.is_last_layer()) and key.startswith('model.embed_tokens'):
+        shard_state_dict[key] = value
+      elif self.args.shard.is_last_layer() and (key.startswith('model.norm')):
+        shard_state_dict[key] = value
+
+    return shard_state_dict
+
+  @property
+  def layers(self):
+    return self.model.layers
+
+  @property
+  def head_dim(self):
+    return self.args.head_dim
+
+  @property
+  def n_kv_heads(self):
+    return self.args.num_key_value_heads
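
As in the other sharded MLX models, only layers inside [start_layer, end_layer] become real TransformerBlocks; every other slot is an IdentityBlock, and the embedding, final RMSNorm and soft-capped logit head only exist on the shards that need them. A small illustration with a hypothetical 26-layer model split across two nodes (model_id and layer count are placeholders):

from exo.inference.shard import Shard

first_half = Shard(model_id="gemma2-example", start_layer=0, end_layer=12, n_layers=26)
second_half = Shard(model_id="gemma2-example", start_layer=13, end_layer=25, n_layers=26)

assert first_half.is_first_layer() and not first_half.is_last_layer()
assert second_half.is_last_layer()
# The first shard embeds tokens and runs layers 0-12; the second receives hidden
# states, runs layers 13-25, then applies the final norm and logit softcapping.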

+ 2 - 1
exo/inference/mlx/models/qwen2.py

@@ -24,6 +24,7 @@ class ModelArgs(ModelArgs):
 
     self.shard = Shard(**self.shard)
 
+
 class Qwen2Model(nn.Module):
   def __init__(self, args: ModelArgs):
     super().__init__()
@@ -57,7 +58,7 @@ class Qwen2Model(nn.Module):
       mask = create_attention_mask(h, cache)
 
     if cache is None:
-      cache = [None] * len(self.layers)
+      cache = [None]*len(self.layers)
 
     for layer, c in zip(self.layers, cache):
       h = layer(h, mask, c)

+ 46 - 21
exo/inference/mlx/sharded_inference_engine.py

@@ -1,14 +1,35 @@
 import numpy as np
 import mlx.core as mx
+import mlx.nn as nn
 from ..inference_engine import InferenceEngine
-from .sharded_model import StatefulShardedModel
+from .stateful_model import StatefulModel
 from .sharded_utils import load_shard, get_image_from_str
 from ..shard import Shard
-from typing import Optional
+from typing import Dict, Optional, Tuple
 from exo.download.shard_download import ShardDownloader
 import asyncio
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
+from mlx_lm.sample_utils import top_p_sampling
+
+def sample_logits(
+  logits: mx.array,
+  temp: float = 0.0,
+  top_p: float = 1.0,
+  logit_bias: Optional[Dict[int, float]] = None
+) -> Tuple[mx.array, float]:
+  if logit_bias:
+    indices = mx.array(list(logit_bias.keys()))
+    values = mx.array(list(logit_bias.values()))
+    logits[:, indices] += values
+
+  if temp == 0:
+    token = mx.argmax(logits, axis=-1)
+  else:
+    if top_p > 0 and top_p < 1.0:
+      token = top_p_sampling(logits, top_p, temp)
+    else:
+      token = mx.random.categorical(logits*(1/temp))
+
+  return token
 
 class MLXDynamicShardInferenceEngine(InferenceEngine):
   def __init__(self, shard_downloader: ShardDownloader):
@@ -16,35 +37,39 @@ class MLXDynamicShardInferenceEngine(InferenceEngine):
     self.shard_downloader = shard_downloader
     self.executor = ThreadPoolExecutor(max_workers=1)
 
-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
+  async def sample(self, x, temp: float = 0.0, top_p: float = 1.0) -> np.ndarray:
+    y = mx.array(x)
+    logits = y[:, -1, :]
+    out = np.array(sample_logits(logits, temp=temp, top_p=top_p), dtype=int)
+    return out
+
+  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
     await self.ensure_shard(shard)
-    loop = asyncio.get_running_loop()
-    if image_str:
-      image = await get_image_from_str(image_str)
-      tokenize = partial(self.tokenizer, prompt, image, return_tensors="np")
-      inputs = await loop.run_in_executor(self.executor, tokenize)
-      pixel_values = mx.array(inputs["pixel_values"])
-      input_ids = mx.array(inputs["input_ids"])
-      output_data: np.ndarray = np.array(await loop.run_in_executor(self.executor, self.stateful_sharded_model.step, request_id, input_ids, pixel_values))
-    else:
-      input_ids = mx.array(await loop.run_in_executor(self.executor, self.tokenizer.encode, prompt))
-      output_data: np.ndarray = np.array(await loop.run_in_executor(self.executor, self.stateful_sharded_model.step, request_id, input_ids))
-    return output_data, "", output_data.size == 1 and output_data.item() == self.tokenizer.eos_token_id
+    tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.encode, prompt)
+    return np.array(tokens)
 
-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
+  async def decode(self, shard: Shard, tokens) -> str:
+    await self.ensure_shard(shard)
+    tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.decode, tokens)
+    return tokens
+    
+  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray) -> np.ndarray:
     await self.ensure_shard(shard)
-    output_data: np.ndarray = np.array(await asyncio.get_running_loop().run_in_executor(self.executor, self.stateful_sharded_model.step, request_id, mx.array(input_data)))
-    return output_data, "", output_data.size == 1 and output_data.item() == self.tokenizer.eos_token_id
+    output_data: np.ndarray = np.array(await asyncio.get_running_loop().run_in_executor(self.executor, self.model, mx.array(input_data), request_id))
+    return output_data
 
   async def ensure_shard(self, shard: Shard):
     if self.shard == shard:
       return
 
-    model_path = await self.shard_downloader.ensure_shard(shard)
+    model_path = await self.shard_downloader.ensure_shard(shard, self.__class__.__name__)
 
     if self.shard != shard:
       loop = asyncio.get_running_loop()
-      def load_shard_wrapper(): return asyncio.run(load_shard(model_path, shard))
+
+      def load_shard_wrapper():
+        return asyncio.run(load_shard(model_path, shard))
+
       model_shard, self.tokenizer = await loop.run_in_executor(self.executor, load_shard_wrapper)
-      self.stateful_sharded_model = await loop.run_in_executor(self.executor, StatefulShardedModel, shard, model_shard)
       self.shard = shard
+      self.model = await loop.run_in_executor(self.executor, StatefulModel, model_shard) 
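
sample_logits reproduces the sampling logic that previously lived in StatefulShardedModel.step: argmax at temperature 0, nucleus sampling when 0 < top_p < 1, otherwise a plain categorical draw over the temperature-scaled distribution. The same decision tree in plain NumPy, for reference (sample_last_token is an illustrative name, and mlx_lm's top_p_sampling may differ in edge cases):

import numpy as np

def sample_last_token(logits: np.ndarray, temp: float = 0.0, top_p: float = 1.0) -> int:
  # logits: (vocab,) scores for the last position produced by the final shard.
  if temp == 0:
    return int(np.argmax(logits))
  scaled = logits / temp
  probs = np.exp(scaled - np.max(scaled))
  probs /= probs.sum()
  if 0 < top_p < 1.0:
    order = np.argsort(-probs)                                   # most probable first
    cutoff = np.searchsorted(np.cumsum(probs[order]), top_p) + 1
    keep = order[:cutoff]                                        # smallest prefix covering top_p mass
    masked = np.zeros_like(probs)
    masked[keep] = probs[keep]
    probs = masked / masked.sum()
  return int(np.random.choice(len(probs), p=probs))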

+ 0 - 86
exo/inference/mlx/sharded_model.py

@@ -1,86 +0,0 @@
-from typing import Dict, Generator, Optional, Tuple
-from collections import OrderedDict
-
-import mlx.core as mx
-import mlx.nn as nn
-from mlx_lm.models.base import KVCache, RotatingKVCache
-from mlx_lm.sample_utils import top_p_sampling
-
-from ..shard import Shard
-
-# TODO: support a speculative model so we can parallelise compute across devices
-class StatefulShardedModel:
-  def __init__(self, shard: Shard, model: nn.Module, max_kv_size: int = 1024, max_caches: int = 2):
-    self.shard = shard
-    self.model = model
-    self.max_kv_size = max_kv_size
-    self.max_caches = max_caches
-    self.caches = OrderedDict()
-
-  def step(
-    self,
-    request_id: str,
-    x,
-    pixel_values=None,
-    temp: float = 0.0,
-    top_p: float = 1.0,
-    logit_bias: Optional[Dict[int, float]] = None,
-  ) -> Generator[Tuple[mx.array, mx.array], None, None]:
-    def sample(logits: mx.array) -> Tuple[mx.array, float]:
-      if logit_bias:
-        indices = mx.array(list(logit_bias.keys()))
-        values = mx.array(list(logit_bias.values()))
-        logits[:, indices] += values
-
-      if temp == 0:
-        token = mx.argmax(logits, axis=-1)
-      else:
-        if top_p > 0 and top_p < 1.0:
-          token = top_p_sampling(logits, top_p, temp)
-        else:
-          token = mx.random.categorical(logits*(1/temp))
-
-      return token
-
-    y = x
-
-    if request_id not in self.caches:
-      self.init_cache(request_id)
-    else:
-      self.caches.move_to_end(request_id)
-
-    cache = self.caches[request_id]
-
-    if pixel_values is None:
-      output = self.model(y[None] if self.shard.is_first_layer() else y, cache=cache)
-    else:
-      output = self.model(y, pixel_values=pixel_values, cache=cache)
-
-    if self.shard.is_last_layer():
-      logits = output[:, -1, :]
-      y = sample(logits)
-      return y
-    else:
-      return output
-
-  def __call__(
-    self,
-    request_id: str,
-    x,
-    temp: float = 0.0,
-    top_p: float = 1.0,
-    logit_bias: Optional[Dict[int, float]] = None,
-  ) -> Generator[Tuple[mx.array, mx.array], None, None]:
-    return self.step(request_id, x, temp=temp, top_p=top_p, logit_bias=logit_bias)
-
-  def init_cache(self, request_id: str):
-    kv_heads = ([self.model.n_kv_heads]*len(self.model.layers) if isinstance(self.model.n_kv_heads, int) else self.model.n_kv_heads)
-    if self.max_kv_size is not None:
-      cache = [RotatingKVCache(self.model.head_dim, n, max_size=self.max_kv_size, keep=4) for n in kv_heads]
-    else:
-      cache = [KVCache(self.model.head_dim, n) for n in kv_heads]
-
-    if len(self.caches) >= self.max_caches:
-      self.caches.popitem(last=False)
-
-    self.caches[request_id] = cache

+ 14 - 8
exo/inference/mlx/sharded_utils.py

@@ -12,15 +12,16 @@ from typing import Optional, Tuple, Union, List, Callable
 from PIL import Image
 from io import BytesIO
 import base64
+import traceback
 
 import mlx.core as mx
 import mlx.nn as nn
 from transformers import AutoProcessor
 
 from mlx_lm.tokenizer_utils import load_tokenizer, TokenizerWrapper
-from mlx_lm.tuner.utils import apply_lora_layers
 
 from exo import DEBUG
+from exo.inference.tokenizers import resolve_tokenizer
 from ..shard import Shard
 
 
@@ -53,6 +54,7 @@ def _get_classes(config: dict):
   except ImportError:
     msg = f"Model type {model_type} not supported."
     logging.error(msg)
+    traceback.print_exc()
     raise ValueError(msg)
 
   return arch.Model, arch.ModelArgs
@@ -67,7 +69,6 @@ def load_config(model_path: Path) -> dict:
     raise
   return config
 
-
 def load_model_shard(
   model_path: Path,
   shard: Shard,
@@ -130,8 +131,17 @@ def load_model_shard(
 
   model_class, model_args_class = _get_classes(config=config)
 
+  class ShardedModel(model_class):
+    def __init__(self, args):
+      super().__init__(args)
+      self.shard = Shard(args.shard.model_id, args.shard.start_layer, args.shard.end_layer, args.shard.n_layers)
+
+    def __call__(self, x, *args, **kwargs):
+      y = super().__call__(x, *args, **kwargs)
+      return y
+
   model_args = model_args_class.from_dict(config)
-  model = model_class(model_args)
+  model = ShardedModel(model_args)
 
   if hasattr(model, "sanitize"):
     weights = model.sanitize(weights)
@@ -157,7 +167,6 @@ def load_model_shard(
   model.eval()
   return model
 
-
 async def load_shard(
   model_path: str,
   shard: Shard,
@@ -167,9 +176,6 @@ async def load_shard(
   lazy: bool = False,
 ) -> Tuple[nn.Module, TokenizerWrapper]:
   model = load_model_shard(model_path, shard, lazy, model_config)
-  if adapter_path is not None:
-    model = apply_lora_layers(model, adapter_path)
-    model.eval()
 
   # TODO: figure out a generic solution
   if model.model_type == "llava":
@@ -178,7 +184,7 @@ async def load_shard(
     processor.encode = processor.tokenizer.encode
     return model, processor
   else:
-    tokenizer = load_tokenizer(model_path, tokenizer_config)
+    tokenizer = await resolve_tokenizer(model_path)
     return model, tokenizer
 
 

+ 42 - 0
exo/inference/mlx/stateful_model.py

@@ -0,0 +1,42 @@
+from typing import Dict, Tuple
+from collections import OrderedDict
+
+import mlx.core as mx
+import mlx.nn as nn
+from mlx_lm.models.cache import make_prompt_cache
+
+from ..shard import Shard
+
+class StatefulModel(nn.Module):
+  def __init__(self, model, max_kv_size: int = 1024, max_caches: int = 2):
+    super().__init__()
+    self.model = model
+    self.max_kv_size = max_kv_size
+    self.max_caches = max_caches
+    self.caches = OrderedDict()
+  
+  def init_cache(self, request_id: str):
+    kv_heads = ([self.model.n_kv_heads]*len(self.model.layers) if isinstance(self.model.n_kv_heads, int) else self.model.n_kv_heads)
+    # if self.max_kv_size is not None:
+      # cache = [RotatingKVCache(self.model.head_dim, n, max_size=self.max_kv_size, keep=4) for n in kv_heads]
+      # cache = [KVCache(self.model.head_dim, n) for n in kv_heads]
+    # else:
+      # cache = [KVCache(self.model.head_dim, n) for n in kv_heads]
+    cache = make_prompt_cache(self.model)
+
+    if len(self.caches) >= self.max_caches:
+      self.caches.popitem(last=False)
+
+    self.caches[request_id] = cache
+
+  def __call__(self, x, request_id: str):
+    if request_id not in self.caches:
+      self.init_cache(request_id)
+    else:
+      self.caches.move_to_end(request_id)
+
+    cache = self.caches[request_id]
+
+    y = self.model(x, cache=cache)
+    return y
+    
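
StatefulModel keeps one mlx_lm prompt cache per request_id in an OrderedDict used as an LRU, capped at max_caches (default 2): a cache is created on first use, refreshed on every call, and the least recently used entry is evicted when the cap is reached. The policy in isolation (a self-contained sketch around a dummy state factory; none of these names come from the repo):

from collections import OrderedDict

class LRUState:
  def __init__(self, make_state, max_entries: int = 2):
    self.make_state = make_state
    self.max_entries = max_entries
    self.entries = OrderedDict()

  def get(self, request_id: str):
    if request_id not in self.entries:
      if len(self.entries) >= self.max_entries:
        self.entries.popitem(last=False)         # evict the least recently used cache
      self.entries[request_id] = self.make_state()
    else:
      self.entries.move_to_end(request_id)       # mark as most recently used
    return self.entries[request_id]

states = LRUState(make_state=list)
states.get("request-a"); states.get("request-b")
states.get("request-a")                          # refreshes request-a
states.get("request-c")                          # evicts request-b
assert list(states.entries) == ["request-a", "request-c"]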

+ 4 - 4
exo/inference/mlx/test_sharded_llama.py

@@ -1,5 +1,5 @@
 import mlx.core as mx
-from exo.inference.mlx.sharded_model import StatefulShardedModel
+from exo.inference.mlx.stateful_model import StatefulModel
 from exo.inference.mlx.sharded_utils import load_shard
 from exo.inference.shard import Shard
 
@@ -12,9 +12,9 @@ full_model_shard, full_tokenizer = load_shard("mlx-community/Meta-Llama-3-8B-Ins
 model_shard1, tokenizer1 = load_shard("mlx-community/Meta-Llama-3-8B-Instruct-4bit", shard=shard1)
 model_shard2, tokenizer2 = load_shard("mlx-community/Meta-Llama-3-8B-Instruct-4bit", shard=shard2)
 
-full = StatefulShardedModel(shard_full, full_model_shard)
-m1 = StatefulShardedModel(shard1, model_shard1)
-m2 = StatefulShardedModel(shard2, model_shard2)
+full = StatefulModel(full_model_shard)
+m1 = StatefulModel(model_shard1)
+m2 = StatefulModel(model_shard2)
 
 prompt = "write a beautiful haiku about a utopia where people own their AI with edge intelligence:"
 prompt_tokens = mx.array(full_tokenizer.encode(prompt))

+ 2 - 2
exo/inference/mlx/test_sharded_llava.py

@@ -5,9 +5,9 @@ from PIL import Image
 from io import BytesIO
 
 import mlx.core as mx
-from mlx_lm.models.base import KVCache
+from mlx_lm.models.cache import KVCache
 
-from exo.inference.mlx.sharded_model import StatefulShardedModel
+from exo.inference.mlx.stateful_model import StatefulModel
 from exo.inference.mlx.sharded_utils import load_shard
 from exo.inference.shard import Shard
 

+ 39 - 42
exo/inference/test_dummy_inference_engine.py

@@ -4,53 +4,50 @@ import numpy as np
 from exo.inference.dummy_inference_engine import DummyInferenceEngine
 from exo.inference.shard import Shard
 
+
 class MockShardDownloader:
-    async def ensure_shard(self, shard):
-        pass
+  async def ensure_shard(self, shard):
+    pass
+
+
 @pytest.mark.asyncio
 async def test_dummy_inference_specific():
-    engine = DummyInferenceEngine(MockShardDownloader())
-    test_shard = Shard(model_id="test_model", start_layer=0, end_layer=1, n_layers=1)
-    test_prompt = "This is a test prompt"
-    
-    result, state, is_finished = await engine.infer_prompt("test_request", test_shard, test_prompt)
-    
-    print(f"Inference result shape: {result.shape}")
-    print(f"Inference state: {state}")
-    print(f"Is finished: {is_finished}")
-    
-    assert result.shape[0] == 1, "Result should be a 2D array with first dimension 1"
-    assert isinstance(json.loads(state), dict), "State should be a valid JSON string"
-    assert isinstance(is_finished, bool), "is_finished should be a boolean"
+  engine = DummyInferenceEngine(MockShardDownloader())
+  test_shard = Shard(model_id="test_model", start_layer=0, end_layer=1, n_layers=1)
+  test_prompt = "This is a test prompt"
+
+  result = await engine.infer_prompt("test_request", test_shard, test_prompt)
+
+  print(f"Inference result shape: {result.shape}")
+
+  assert result.shape[0] == 1, "Result should be a 2D array with first dimension 1"
+
 
 @pytest.mark.asyncio
 async def test_dummy_inference_engine():
-    # Initialize the DummyInferenceEngine
-    engine = DummyInferenceEngine(MockShardDownloader())
-    
-    # Create a test shard
-    shard = Shard(model_id="test_model", start_layer=0, end_layer=1, n_layers=1)
-    
-    # Test infer_prompt
-    output, state, is_finished = await engine.infer_prompt("test_id", shard, "Test prompt")
-    
-    assert isinstance(output, np.ndarray), "Output should be a numpy array"
-    assert output.ndim == 2, "Output should be 2-dimensional"
-    assert isinstance(state, str), "State should be a string"
-    assert isinstance(is_finished, bool), "is_finished should be a boolean"
-
-    # Test infer_tensor
-    input_tensor = np.array([[1, 2, 3]])
-    output, state, is_finished = await engine.infer_tensor("test_id", shard, input_tensor)
-    
-    assert isinstance(output, np.ndarray), "Output should be a numpy array"
-    assert output.ndim == 2, "Output should be 2-dimensional"
-    assert isinstance(state, str), "State should be a string"
-    assert isinstance(is_finished, bool), "is_finished should be a boolean"
-
-    print("All tests passed!")
+  # Initialize the DummyInferenceEngine
+  engine = DummyInferenceEngine(MockShardDownloader())
+
+  # Create a test shard
+  shard = Shard(model_id="test_model", start_layer=0, end_layer=1, n_layers=1)
+
+  # Test infer_prompt
+  output = await engine.infer_prompt("test_id", shard, "Test prompt")
+
+  assert isinstance(output, np.ndarray), "Output should be a numpy array"
+  assert output.ndim == 2, "Output should be 2-dimensional"
+
+  # Test infer_tensor
+  input_tensor = np.array([[1, 2, 3]])
+  output = await engine.infer_tensor("test_id", shard, input_tensor)
+
+  assert isinstance(output, np.ndarray), "Output should be a numpy array"
+  assert output.ndim == 2, "Output should be 2-dimensional"
+
+  print("All tests passed!")
+
 
 if __name__ == "__main__":
-    import asyncio
-    asyncio.run(test_dummy_inference_engine())
-    asyncio.run(test_dummy_inference_specific())
+  import asyncio
+  asyncio.run(test_dummy_inference_engine())
+  asyncio.run(test_dummy_inference_specific())

+ 14 - 24
exo/inference/test_inference_engine.py

@@ -11,45 +11,40 @@ import numpy as np
 # An inference engine should work the same for any number of Shards, as long as the Shards are continuous.
 async def test_inference_engine(inference_engine_1: InferenceEngine, inference_engine_2: InferenceEngine, model_id: str, n_layers: int):
   prompt = "In a single word only, what is the last name of the current president of the USA?"
-  resp_full, inference_state_full, _ = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=n_layers - 1, n_layers=n_layers), prompt=prompt)
-  next_resp_full, _next_inference_state_full, _ = await inference_engine_1.infer_tensor(
+  resp_full = await inference_engine_1.infer_prompt("A", shard=Shard(model_id=model_id, start_layer=0, end_layer=n_layers - 1, n_layers=n_layers), prompt=prompt)
+  token_full = await inference_engine_1.sample(resp_full)
+  token_full = token_full.reshape(1, -1)
+  next_resp_full = await inference_engine_1.infer_tensor(
     "A",
     shard=Shard(model_id=model_id, start_layer=0, end_layer=n_layers - 1, n_layers=n_layers),
-    input_data=resp_full,
-    inference_state=inference_state_full,
+    input_data=token_full,
   )
 
   pp = n_layers // 2
-  resp1, inference_state_1, _ = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=pp, n_layers=n_layers), prompt=prompt)
-  resp2, inference_state_2, _ = await inference_engine_2.infer_tensor(
+  resp1 = await inference_engine_1.infer_prompt("B", shard=Shard(model_id=model_id, start_layer=0, end_layer=pp, n_layers=n_layers), prompt=prompt)
+  resp2 = await inference_engine_2.infer_tensor(
     "B",
     shard=Shard(model_id=model_id, start_layer=pp + 1, end_layer=n_layers - 1, n_layers=n_layers),
     input_data=resp1,
-    inference_state=inference_state_1,
   )
-  resp3, inference_state_3, _ = await inference_engine_1.infer_tensor(
+  tokens2 = await inference_engine_1.sample(resp2)
+  tokens2 = tokens2.reshape(1, -1)
+  resp3 = await inference_engine_1.infer_tensor(
     "B",
     shard=Shard(model_id=model_id, start_layer=0, end_layer=pp, n_layers=n_layers),
-    input_data=resp2,
-    inference_state=inference_state_2,
+    input_data=tokens2,
   )
-  resp4, _inference_state_4, _ = await inference_engine_2.infer_tensor(
+  resp4 = await inference_engine_2.infer_tensor(
     "B",
     shard=Shard(model_id=model_id, start_layer=pp + 1, end_layer=n_layers - 1, n_layers=n_layers),
     input_data=resp3,
-    inference_state=inference_state_3,
   )
 
   assert np.array_equal(resp_full, resp2)
   assert np.array_equal(next_resp_full, resp4)
 
 
-asyncio.run(test_inference_engine(
-  MLXDynamicShardInferenceEngine(HFShardDownloader()),
-  MLXDynamicShardInferenceEngine(HFShardDownloader()),
-  "mlx-community/Llama-3.2-1B-Instruct-4bit",
-  16
-))
+asyncio.run(test_inference_engine(MLXDynamicShardInferenceEngine(HFShardDownloader()), MLXDynamicShardInferenceEngine(HFShardDownloader()), "llama-3.2-1b", 16))
 
 if os.getenv("RUN_TINYGRAD", default="0") == "1":
   import tinygrad
@@ -57,10 +52,5 @@ if os.getenv("RUN_TINYGRAD", default="0") == "1":
   from exo.inference.tinygrad.inference import TinygradDynamicShardInferenceEngine
   tinygrad.helpers.DEBUG.value = int(os.getenv("TINYGRAD_DEBUG", default="0"))
   asyncio.run(
-    test_inference_engine(
-      TinygradDynamicShardInferenceEngine(HFShardDownloader()),
-      TinygradDynamicShardInferenceEngine(HFShardDownloader()),
-      "TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R",
-      32
-    )
+    test_inference_engine(TinygradDynamicShardInferenceEngine(HFShardDownloader()), TinygradDynamicShardInferenceEngine(HFShardDownloader()), "llama-3-8b", 32)
   )

+ 34 - 36
exo/inference/tinygrad/inference.py

@@ -1,17 +1,17 @@
 from pathlib import Path
 import json
 import os
-from exo.inference.tinygrad.models.llama import Transformer, convert_from_huggingface, fix_bf16
+from exo.inference.tinygrad.models.llama import Transformer, convert_from_huggingface, fix_bf16, sample_logits
 from exo.inference.shard import Shard
 from exo.inference.tokenizers import resolve_tokenizer
 from tinygrad.nn.state import load_state_dict
 from tinygrad import Tensor, nn, Context
 from exo.inference.inference_engine import InferenceEngine
-from typing import Optional, Tuple
 import numpy as np
 from exo.inference.tinygrad.tinygrad_helpers import concat_weights, load
 from exo.download.shard_download import ShardDownloader
 from concurrent.futures import ThreadPoolExecutor
+from .stateful_model import StatefulModel
 import asyncio
 
 Tensor.no_grad = True
@@ -22,7 +22,17 @@ TOP_P = 0.9
 ALPHA_F = 0.1
 ALPHA_P = 0.0
 MODEL_PARAMS = {
-  "8B": {"args": {"dim": 4096, "n_heads": 32, "n_kv_heads": 8, "n_layers": 32, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 14336}, "files": 1},
+  "1B": {
+    "args": {
+      "dim": 2048, "n_heads": 32, "n_kv_heads": 8, "n_layers": 16, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 8192,
+      "rope_scaling": {"factor": 32.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3"}, "tie_word_embeddings": True
+    }, "files": 1
+  }, "3B": {
+    "args": {
+      "dim": 3072, "n_heads": 24, "n_kv_heads": 8, "n_layers": 28, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 8192,
+      "rope_scaling": {"factor": 32.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3"}, "tie_word_embeddings": True
+    }, "files": 1
+  }, "8B": {"args": {"dim": 4096, "n_heads": 32, "n_kv_heads": 8, "n_layers": 32, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 14336}, "files": 1},
   "70B": {"args": {"dim": 8192, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-5, "rope_theta": 500000, "vocab_size": 128256, "hidden_dim": 28672}, "files": 8}
 }
 
@@ -30,8 +40,7 @@ MODEL_PARAMS = {
 def build_transformer(model_path: Path, shard: Shard, model_size="8B", device=None):
   # build model
   linear = nn.Linear
-  with Context(THREEFRY=0):
-    model = Transformer(**MODEL_PARAMS[model_size]["args"], linear=linear, max_context=8192, jit=True, shard=shard)
+  model = Transformer(**MODEL_PARAMS[model_size]["args"], linear=linear, max_context=8192, jit=True, shard=shard)
 
   # load weights
   if model_path.is_dir():
@@ -48,54 +57,43 @@ def build_transformer(model_path: Path, shard: Shard, model_size="8B", device=No
     load_state_dict(model, weights, strict=False, consume=False)  # consume=True
   return model
 
-
 class TinygradDynamicShardInferenceEngine(InferenceEngine):
   def __init__(self, shard_downloader: ShardDownloader):
     self.shard = None
     self.shard_downloader = shard_downloader
     self.executor = ThreadPoolExecutor(max_workers=1)
 
-  async def infer_prompt(self, request_id: str, shard: Shard, prompt: str, image_str: Optional[str] = None, inference_state: Optional[str] = None) -> (np.ndarray, str, bool):
-    await self.ensure_shard(shard)
-    start_pos = json.loads(inference_state or "{}").get("start_pos", 0)
-    n_captured_toks = json.loads(inference_state or "{}").get("n_captured_toks", 0)
-
-    toks = await asyncio.get_event_loop().run_in_executor(self.executor, self.tokenizer.encode, prompt)
-    h = await asyncio.get_event_loop().run_in_executor(self.executor, lambda: self.model(Tensor([toks]), start_pos, TEMPERATURE).realize())
-
-    if h.shape == (1,):
-      start_pos += len(toks)
-      start_pos += 1
-      n_captured_toks = 0
-      return np.array([[h.item()]]), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), h.item() == self.tokenizer.eos_token_id
-    else:
-      n_captured_toks = len(toks)
-      return h.numpy(), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), False
+  async def sample(self, x: np.ndarray, temp=TEMPERATURE, top_p: float = 0.0) -> np.ndarray:
+    logits = x[:, -1, :]
+    def sample_wrapper():
+      return sample_logits(Tensor(logits).flatten(), temp, 0, 0.8, top_p, 0.0).realize().numpy().astype(int)
+    return await asyncio.get_running_loop().run_in_executor(self.executor, sample_wrapper)
 
-  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray, inference_state: Optional[str] = None) -> Tuple[np.ndarray, str, bool]:
+  async def encode(self, shard: Shard, prompt: str) -> np.ndarray:
     await self.ensure_shard(shard)
-    start_pos = json.loads(inference_state or "{}").get("start_pos", 0)
-    n_captured_toks = json.loads(inference_state or "{}").get("n_captured_toks", 0)
-
-    h = await asyncio.get_event_loop().run_in_executor(self.executor, lambda: self.model(Tensor(input_data), start_pos, TEMPERATURE).realize())
+    tokens = await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.encode, prompt)
+    return await asyncio.get_running_loop().run_in_executor(self.executor, np.array, tokens)
+  
+  async def decode(self, shard: Shard, tokens) -> str:
+    await self.ensure_shard(shard)
+    return await asyncio.get_running_loop().run_in_executor(self.executor, self.tokenizer.decode, tokens)
 
-    if h.shape == (1,):
-      start_pos += n_captured_toks
-      start_pos += 1
-      n_captured_toks = 0
-      return np.array([[h.item()]]), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), h.item() == self.tokenizer.eos_token_id
-    else:
-      return h.numpy(), json.dumps({"start_pos": start_pos, "n_captured_toks": n_captured_toks}), False
+  async def infer_tensor(self, request_id: str, shard: Shard, input_data: np.ndarray) -> np.ndarray:
+    await self.ensure_shard(shard)
+    return await asyncio.get_running_loop().run_in_executor(self.executor, lambda: self.model(Tensor(input_data), request_id).realize().numpy())
 
   async def ensure_shard(self, shard: Shard):
     if self.shard == shard:
       return
 
-    model_path = await self.shard_downloader.ensure_shard(shard)
+    model_path = await self.shard_downloader.ensure_shard(shard, self.__class__.__name__)
 
     if self.shard != shard:
-      self.model = await asyncio.get_event_loop().run_in_executor(self.executor, build_transformer, model_path, shard, "8B" if "8b" in shard.model_id.lower() else "70B")
+      loop = asyncio.get_running_loop()
+      parameters = "1B" if "1b" in shard.model_id.lower() else "3B" if "3b" in shard.model_id.lower() else "8B" if "8b" in shard.model_id.lower() else "70B"
+      model_shard = await loop.run_in_executor(self.executor, build_transformer, model_path, shard, parameters)
 
       tokenizer_path = str((model_path if model_path.is_dir() else model_path.parent))
       self.tokenizer = await resolve_tokenizer(tokenizer_path)
       self.shard = shard
+      self.model = await loop.run_in_executor(self.executor, StatefulModel, model_shard) 

+ 61 - 36
exo/inference/tinygrad/models/llama.py

@@ -1,11 +1,23 @@
-from typing import Tuple, Union, Optional, Dict, Any
+from typing import Tuple, Union, Optional, Dict, Any, List
 from tinygrad import Tensor, Variable, TinyJit, dtypes, nn, Device
 from tinygrad.helpers import getenv
+from collections import OrderedDict
 
 
 # https://github.com/facebookresearch/llama/blob/1076b9c51c77ad06e9d7ba8a4c6df775741732bd/llama/model.py#L47
-def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, dtype=dtypes.half) -> Tensor:
+def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0, dtype=dtypes.half, rope_scaling: Optional[Dict[str, float]] = None) -> Tensor:
   freqs = 1.0/(theta**(Tensor.arange(0, dim, 2)[:(dim // 2)]/dim))
+
+  if rope_scaling:
+    factor = rope_scaling.get('factor', 1.0)
+    low_freq_factor = rope_scaling.get('low_freq_factor', 1.0)
+    high_freq_factor = rope_scaling.get('high_freq_factor', 1.0)
+    original_max_pos_emb = rope_scaling.get('original_max_position_embeddings', end)
+
+    freqs[:dim // 4] *= low_freq_factor
+    freqs[dim // 4:] = freqs[dim // 4:].contiguous()*high_freq_factor
+    freqs *= (original_max_pos_emb/end)**(1.0/factor)
+
   freqs = Tensor.arange(end).unsqueeze(dim=1)*freqs.unsqueeze(dim=0)
   # TODO: move dtype outside this
   return Tensor.stack(freqs.cos().cast(dtype), freqs.sin().cast(dtype), dim=-1).reshape(1, end, 1, dim // 2, 2)
@@ -36,7 +48,6 @@ def repeat_kv(x: Tensor, n_rep: int) -> Tensor:
   # NOTE: this is different from x.repeat((1, 1, n_rep, 1))
   return x.repeat((1, 1, 1, n_rep)).reshape(bs, seqlen, n_kv_heads*n_rep, head_dim)
 
-
 class Attention:
   def __init__(self, dim, n_heads, n_kv_heads, max_context, linear=nn.Linear):
     self.n_heads = n_heads
@@ -50,7 +61,7 @@ class Attention:
     self.wv = linear(dim, self.n_kv_heads*self.head_dim, bias=False)
     self.wo = linear(self.n_heads*self.head_dim, dim, bias=False)
 
-  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor]) -> Tensor:
+  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor], cache: Optional[Tensor]=None) -> Tensor:
     if getenv("WQKV"):
       if not hasattr(self, 'wqkv'): self.wqkv = Tensor.cat(self.wq.weight, self.wk.weight, self.wv.weight)
       xqkv = x @ self.wqkv.T
@@ -65,19 +76,16 @@ class Attention:
     xq, xk = apply_rotary_emb(xq, xk, freqs_cis)
     bsz, seqlen, _, _ = xq.shape
 
-    # create kv cache
-    if not hasattr(self, "cache_kv"):
-      self.cache_kv = Tensor.zeros(2, bsz, self.max_context, self.n_kv_heads, self.head_dim, dtype=x.dtype).contiguous().realize()
-      if isinstance(x.device, tuple):
-        # TODO: instead of specifying how to shard, it can follow how xk and xv are being sharded
-        self.cache_kv.shard_((x.device), axis=3 if getenv("SHARD_KVCACHE") else None).realize()
+    if cache is not None:
+      # update the cache
+      assert xk.dtype == xv.dtype == cache.dtype, f"{xk.dtype=}, {xv.dtype=}, {cache.dtype=}"
+      cache.shrink((None, None, (start_pos, start_pos + seqlen), None, None)).assign(Tensor.stack(xk, xv)).realize()
 
-    # update the cache
-    assert xk.dtype == xv.dtype == self.cache_kv.dtype, f"{xk.dtype=}, {xv.dtype=}, {self.cache_kv.dtype=}"
-    self.cache_kv.shrink((None, None, (start_pos, start_pos + seqlen), None, None)).assign(Tensor.stack(xk, xv)).realize()
-
-    keys = self.cache_kv[0].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xk
-    values = self.cache_kv[1].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xv
+      keys = cache[0].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xk
+      values = cache[1].shrink((None, (0, start_pos + seqlen), None, None)) if start_pos > 0 else xv
+    else:
+      keys = xk
+      values = xv
 
     keys, values = repeat_kv(keys, self.n_rep), repeat_kv(values, self.n_rep)
     xq, keys, values = xq.transpose(1, 2), keys.transpose(1, 2), values.transpose(1, 2)
@@ -103,13 +111,13 @@ class TransformerBlock:
     self.attention_norm = nn.RMSNorm(dim, norm_eps)
     self.ffn_norm = nn.RMSNorm(dim, norm_eps)
 
-  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor]):
-    h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask)
+  def __call__(self, x: Tensor, start_pos: Union[Variable, int], freqs_cis: Tensor, mask: Optional[Tensor], cache: Optional[Tensor]=None):
+    h = x + self.attention(self.attention_norm(x), start_pos, freqs_cis, mask, cache=cache)
     return (h + self.feed_forward(self.ffn_norm(h))).contiguous()
 
 
 # standard openai sampling
-def sample(logits: Tensor, temp: float, k: int, p: float, af: float, ap: float):
+def sample_logits(logits: Tensor, temp: float, k: int, p: float, af: float, ap: float):
   assert logits.ndim == 1, "only works on 1d tensors"
   assert 0 <= p <= 1, "p must be between 0 and 1"
   assert 0 <= k <= logits.numel(), "k must be between 0 and numel"
@@ -176,42 +184,56 @@ class Transformer:
     rope_theta=10000,
     max_context=1024,
     jit=True,
-    feed_forward=FeedForward
+    feed_forward=FeedForward,
+    rope_scaling: Optional[Dict[str, float]] = None,
+    tie_word_embeddings=False,
   ):
     self.layers = [TransformerBlock(dim, hidden_dim, n_heads, n_kv_heads, norm_eps, max_context, linear, feed_forward=feed_forward) for _ in range(n_layers)]
     self.norm = nn.RMSNorm(dim, norm_eps)
     self.tok_embeddings = nn.Embedding(vocab_size, dim)
     self.output = nn.Linear(dim, vocab_size, bias=False)
+    if tie_word_embeddings:
+      self.output.weight = self.tok_embeddings.weight
     self.max_context = max_context
-    self.freqs_cis = precompute_freqs_cis(dim // n_heads, self.max_context*2, rope_theta).contiguous()
-    self.forward_jit = TinyJit(self.forward) if jit else None
+    self.freqs_cis = precompute_freqs_cis(dim // n_heads, self.max_context*2, rope_theta, rope_scaling=rope_scaling).contiguous()
+    self.forward_jit = TinyJit(self.forward_base) if jit else None
     self.shard = shard
 
-  def forward(self, x: Tensor, start_pos: Union[Variable, int], temperature: float, top_k: int, top_p: float, alpha_f: float, alpha_p: float):
+  def forward_base(self, x: Tensor, start_pos: Union[Variable, int], cache: Optional[List[Tensor]] = None):
     seqlen = x.shape[1]
     freqs_cis = self.freqs_cis.shrink((None, (start_pos, start_pos + seqlen), None, None, None))
     mask = Tensor.full((1, 1, seqlen, start_pos + seqlen), float("-100000000"), dtype=x.dtype, device=x.device).triu(start_pos + 1).realize() if seqlen > 1 else None
 
-    if self.shard.is_first_layer():
-      h = self.tok_embeddings(x)
-    else:
-      h = x
+    h = x
 
-    for i in range(self.shard.start_layer, self.shard.end_layer + 1):
+    if cache is None:
+      cache = [None for _ in range(self.shard.start_layer, self.shard.end_layer + 1)]  
+    for i, c in zip(range(self.shard.start_layer, self.shard.end_layer + 1), cache):
       layer = self.layers[i]
-      h = layer(h, start_pos, freqs_cis, mask)
+      h = layer(h, start_pos, freqs_cis, mask, cache=c)
 
     if self.shard.is_last_layer():
-      logits = self.output(self.norm(h)).float()[:, -1, :]
-      return sample(logits.flatten(), temperature, top_k, top_p, alpha_f, alpha_p).realize()
+      logits = self.output(self.norm(h)).float().realize()
+      return logits
     else:
       return h
 
-  def __call__(self, tokens: Tensor, start_pos: Variable, temperature: float = 0.0, top_k: int = 0, top_p: float = 0.8, alpha_f: float = 0.0, alpha_p: float = 0.0):
+  def embed(self, inputs: Tensor):
+    if self.shard.is_first_layer():
+      h = self.tok_embeddings(inputs)
+    else:
+      h = inputs
+    return h
+
+  def forward(self, x: Tensor, start_pos: int, cache: Optional[List[Tensor]] = None):
+    if x.shape[0:2] == (1, 1) and self.forward_jit is not None and start_pos != 0:
+      return self.forward_jit(x, Variable("start_pos", 1, self.max_context).bind(start_pos), cache=cache)
+    return self.forward_base(x, start_pos, cache=cache)
+
+  def __call__(self, tokens: Tensor, start_pos: Variable, cache: Optional[List[Tensor]] = None):
     # TODO: better way to handle the first call v.s. the rest?
-    if tokens.shape[0:2] == (1, 1) and self.forward_jit is not None:
-      return self.forward_jit(tokens, Variable("start_pos", 0, self.max_context).bind(start_pos), temperature, top_k, top_p, alpha_f, alpha_p)
-    return self.forward(tokens, start_pos, temperature, top_k, top_p, alpha_f, alpha_p)
+    h = self.embed(tokens)
+    return self.forward(h, start_pos, cache=cache)
 
 
 # *** helpers ***
@@ -245,7 +267,10 @@ def convert_from_huggingface(weights: Dict[str, Tensor], model: Transformer, n_h
         v = permute(v, n_heads)
       elif "k_proj" in k:
         v = permute(v, n_kv_heads)
-    sd[keymap[k]] = v
+    if k in keymap:
+      sd[keymap[k]] = v
+    else:
+      sd[k] = v
   return sd
 
 

+ 42 - 0
exo/inference/tinygrad/stateful_model.py

@@ -0,0 +1,42 @@
+from tinygrad import Tensor, Variable
+from tinygrad.helpers import getenv
+from collections import OrderedDict
+from typing import List
+
+def create_kv_cache(x: Tensor, max_context: int, n_kv_heads: int, head_dim: int):
+  cache_kv = Tensor.zeros(2, x.shape[0], max_context, n_kv_heads, head_dim, dtype=x.dtype).contiguous().realize()
+  if isinstance(x.device, tuple):
+    # TODO: instead of specifying how to shard, it can follow how xk and xv are being sharded
+    cache_kv.shard_((x.device), axis=3 if getenv("SHARD_KVCACHE") else None).realize()
+  return cache_kv.realize()
+
+class ModelState:
+  cache: List[Tensor]
+  start: int 
+  def __init__(self, cache: List[Tensor], start: int = 0):
+    self.cache = cache
+    self.start = start
+
+class StatefulModel:
+  def __init__(self, model, max_states: int = 2):
+    super().__init__()
+    self.model = model
+    self.max_states = max_states
+    self.states = OrderedDict()
+ 
+  def init_cache(self, x: Tensor, request_id: str):
+    cache = [create_kv_cache(x, self.model.layers[i].attention.max_context, self.model.layers[i].attention.n_kv_heads, self.model.layers[i].attention.head_dim) for i in range(self.model.shard.start_layer, self.model.shard.end_layer + 1)]
+    if len(self.states) >= self.max_states:
+      self.states.popitem(last=False)
+
+    self.states[request_id] = ModelState(cache)
+
+  def __call__(self, x: Tensor, request_id: str): 
+    h = self.model.embed(x)
+    if request_id not in self.states:
+      self.init_cache(h, request_id)
+    else:
+      self.states.move_to_end(request_id)
+    out = self.model.forward(h, self.states[request_id].start, cache=self.states[request_id].cache)
+    self.states[request_id].start += h.shape[1]
+    return out
+
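
Unlike the MLX wrapper, the tinygrad StatefulModel also has to track the decode position itself: ModelState.start records where in the per-request KV cache the next keys/values are written, and each call advances it by the sequence length. Reduced to the bookkeeping alone (an illustrative sketch, not code from the repo):

class PositionTracker:
  """Per-request write offset, mirroring what ModelState.start does."""
  def __init__(self):
    self.start = {}

  def step(self, request_id: str, seqlen: int) -> int:
    start_pos = self.start.get(request_id, 0)
    self.start[request_id] = start_pos + seqlen   # the next chunk is written after this one
    return start_pos

tracker = PositionTracker()
assert tracker.step("req", 4) == 0   # a 4-token prompt fills positions 0-3
assert tracker.step("req", 1) == 4   # the first decoded token goes to position 4
assert tracker.step("req", 1) == 5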

+ 6 - 1
exo/inference/tokenizers.py

@@ -7,14 +7,18 @@ from transformers import AutoTokenizer, AutoProcessor
 from exo.download.hf.hf_helpers import get_local_snapshot_dir
 from exo.helpers import DEBUG
 
+
 class DummyTokenizer:
   def __init__(self):
     self.eos_token_id = 0
+
   def apply_chat_template(self, messages, tokenize=True, add_generation_prompt=True):
-    return [1,2,3]
+    return [1, 2, 3]
+
   def decode(self, tokens):
     return "dummy"
 
+
 async def resolve_tokenizer(model_id: str):
   if model_id == "dummy":
     return DummyTokenizer()
@@ -29,6 +33,7 @@ async def resolve_tokenizer(model_id: str):
     if DEBUG >= 5: traceback.print_exc()
   return await _resolve_tokenizer(model_id)
 
+
 async def _resolve_tokenizer(model_id_or_local_path: Union[str, PathLike]):
   try:
     if DEBUG >= 4: print(f"Trying AutoProcessor for {model_id_or_local_path}")

+ 76 - 40
exo/main.py

@@ -1,8 +1,12 @@
 import argparse
 import asyncio
+import atexit
 import signal
 import json
 import logging
+import platform
+import os
+import sys
 import time
 import traceback
 import uuid
@@ -17,22 +21,24 @@ from exo.topology.ring_memory_weighted_partitioning_strategy import RingMemoryWe
 from exo.api import ChatGPTAPI
 from exo.download.shard_download import ShardDownloader, RepoProgressEvent, NoopShardDownloader
 from exo.download.hf.hf_shard_download import HFShardDownloader
-from exo.helpers import print_yellow_exo, find_available_port, DEBUG, get_system_info, get_or_create_node_id, get_all_ip_addresses, terminal_link
+from exo.helpers import print_yellow_exo, find_available_port, DEBUG, get_system_info, get_or_create_node_id, get_all_ip_addresses, terminal_link, shutdown
 from exo.inference.shard import Shard
 from exo.inference.inference_engine import get_inference_engine, InferenceEngine
-from exo.inference.dummy_inference_engine import DummyInferenceEngine
 from exo.inference.tokenizers import resolve_tokenizer
 from exo.orchestration.node import Node
-from exo.models import model_base_shards
+from exo.models import build_base_shard, get_repo
 from exo.viz.topology_viz import TopologyViz
+from exo.download.hf.hf_helpers import has_hf_home_read_access, has_hf_home_write_access, get_hf_home, move_models_to_hf
 
 # parse args
 parser = argparse.ArgumentParser(description="Initialize GRPC Discovery")
 parser.add_argument("command", nargs="?", choices=["run"], help="Command to run")
 parser.add_argument("model_name", nargs="?", help="Model name to run")
+parser.add_argument("--default-model", type=str, default=None, help="Default model")
 parser.add_argument("--node-id", type=str, default=None, help="Node ID")
 parser.add_argument("--node-host", type=str, default="0.0.0.0", help="Node host")
 parser.add_argument("--node-port", type=int, default=None, help="Node port")
+parser.add_argument("--models-seed-dir", type=str, default=None, help="Model seed directory")
 parser.add_argument("--listen-port", type=int, default=5678, help="Listening port for discovery")
 parser.add_argument("--download-quick-check", action="store_true", help="Quick check local path for model shards download")
 parser.add_argument("--max-parallel-downloads", type=int, default=4, help="Max parallel downloads for model shards download")
@@ -42,7 +48,7 @@ parser.add_argument("--discovery-module", type=str, choices=["udp", "tailscale",
 parser.add_argument("--discovery-timeout", type=int, default=30, help="Discovery timeout in seconds")
 parser.add_argument("--discovery-config-path", type=str, default=None, help="Path to discovery config json file")
 parser.add_argument("--wait-for-peers", type=int, default=0, help="Number of peers to wait to connect to before starting")
-parser.add_argument("--chatgpt-api-port", type=int, default=8000, help="ChatGPT API port")
+parser.add_argument("--chatgpt-api-port", type=int, default=52415, help="ChatGPT API port")
 parser.add_argument("--chatgpt-api-response-timeout", type=int, default=900, help="ChatGPT API response timeout in seconds")
 parser.add_argument("--max-generate-tokens", type=int, default=10000, help="Max tokens to generate in each request")
 parser.add_argument("--inference-engine", type=str, default=None, help="Inference engine to use (mlx, tinygrad, or dummy)")
@@ -54,14 +60,13 @@ parser.add_argument("--tailnet-name", type=str, default=None, help="Tailnet name
 args = parser.parse_args()
 print(f"Selected inference engine: {args.inference_engine}")
 
-
 print_yellow_exo()
 
-
 system_info = get_system_info()
 print(f"Detected system: {system_info}")
 
-shard_downloader: ShardDownloader = HFShardDownloader(quick_check=args.download_quick_check, max_parallel_downloads=args.max_parallel_downloads) if args.inference_engine != "dummy" else NoopShardDownloader()
+shard_downloader: ShardDownloader = HFShardDownloader(quick_check=args.download_quick_check,
+                                                      max_parallel_downloads=args.max_parallel_downloads) if args.inference_engine != "dummy" else NoopShardDownloader()
 inference_engine_name = args.inference_engine or ("mlx" if system_info == "Apple Silicon Mac" else "tinygrad")
 print(f"Inference engine name after selection: {inference_engine_name}")
 
@@ -84,9 +89,23 @@ if DEBUG >= 0:
     print(f" - {terminal_link(chatgpt_api_endpoint)}")
 
 if args.discovery_module == "udp":
-  discovery = UDPDiscovery(args.node_id, args.node_port, args.listen_port, args.broadcast_port, lambda peer_id, address, device_capabilities: GRPCPeerHandle(peer_id, address, device_capabilities), discovery_timeout=args.discovery_timeout)
+  discovery = UDPDiscovery(
+    args.node_id,
+    args.node_port,
+    args.listen_port,
+    args.broadcast_port,
+    lambda peer_id, address, device_capabilities: GRPCPeerHandle(peer_id, address, device_capabilities),
+    discovery_timeout=args.discovery_timeout
+  )
 elif args.discovery_module == "tailscale":
-  discovery = TailscaleDiscovery(args.node_id, args.node_port, lambda peer_id, address, device_capabilities: GRPCPeerHandle(peer_id, address, device_capabilities), discovery_timeout=args.discovery_timeout, tailscale_api_key=args.tailscale_api_key, tailnet=args.tailnet_name)
+  discovery = TailscaleDiscovery(
+    args.node_id,
+    args.node_port,
+    lambda peer_id, address, device_capabilities: GRPCPeerHandle(peer_id, address, device_capabilities),
+    discovery_timeout=args.discovery_timeout,
+    tailscale_api_key=args.tailscale_api_key,
+    tailnet=args.tailnet_name
+  )
 elif args.discovery_module == "manual":
   if not args.discovery_config_path:
     raise ValueError(f"--discovery-config-path is required when using manual discovery. Please provide a path to a config json file.")
@@ -108,22 +127,26 @@ api = ChatGPTAPI(
   node,
   inference_engine.__class__.__name__,
   response_timeout=args.chatgpt_api_response_timeout,
-  on_chat_completion_request=lambda req_id, __, prompt: topology_viz.update_prompt(req_id, prompt) if topology_viz else None
+  on_chat_completion_request=lambda req_id, __, prompt: topology_viz.update_prompt(req_id, prompt) if topology_viz else None,
+  default_model=args.default_model
 )
 node.on_token.register("update_topology_viz").on_next(
   lambda req_id, tokens, __: topology_viz.update_prompt_output(req_id, inference_engine.tokenizer.decode(tokens)) if topology_viz and hasattr(inference_engine, "tokenizer") else None
 )
+
 def preemptively_start_download(request_id: str, opaque_status: str):
   try:
     status = json.loads(opaque_status)
     if status.get("type") == "node_status" and status.get("status") == "start_process_prompt":
       current_shard = node.get_current_shard(Shard.from_dict(status.get("shard")))
       if DEBUG >= 2: print(f"Preemptively starting download for {current_shard}")
-      asyncio.create_task(shard_downloader.ensure_shard(current_shard))
+      asyncio.create_task(shard_downloader.ensure_shard(current_shard, inference_engine.__class__.__name__))
   except Exception as e:
     if DEBUG >= 2:
       print(f"Failed to preemptively start download: {e}")
       traceback.print_exc()
+
+
 node.on_opaque_status.register("start_download").on_next(preemptively_start_download)
 
 if args.prometheus_client_port:
@@ -132,38 +155,24 @@ if args.prometheus_client_port:
 
 last_broadcast_time = 0
 
-def throttled_broadcast(shard: Shard, event: RepoProgressEvent):
-    global last_broadcast_time
-    current_time = time.time()
-    if event.status == "complete" or current_time - last_broadcast_time >= 0.1:
-        last_broadcast_time = current_time
-        asyncio.create_task(node.broadcast_opaque_status("", json.dumps({
-            "type": "download_progress",
-            "node_id": node.id,
-            "progress": event.to_dict()
-        })))
 
-shard_downloader.on_progress.register("broadcast").on_next(throttled_broadcast)
+def throttled_broadcast(shard: Shard, event: RepoProgressEvent):
+  global last_broadcast_time
+  current_time = time.time()
+  if event.status == "complete" or current_time - last_broadcast_time >= 0.1:
+    last_broadcast_time = current_time
+    asyncio.create_task(node.broadcast_opaque_status("", json.dumps({"type": "download_progress", "node_id": node.id, "progress": event.to_dict()})))
 
 
-async def shutdown(signal, loop):
-  """Gracefully shutdown the server and close the asyncio loop."""
-  print(f"Received exit signal {signal.name}...")
-  print("Thank you for using exo.")
-  print_yellow_exo()
-  server_tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]
-  [task.cancel() for task in server_tasks]
-  print(f"Cancelling {len(server_tasks)} outstanding tasks")
-  await asyncio.gather(*server_tasks, return_exceptions=True)
-  await server.stop()
-  loop.stop()
+shard_downloader.on_progress.register("broadcast").on_next(throttled_broadcast)
 
 async def run_model_cli(node: Node, inference_engine: InferenceEngine, model_name: str, prompt: str):
-  shard = model_base_shards.get(model_name, {}).get(inference_engine.__class__.__name__)
+  inference_class = inference_engine.__class__.__name__
+  shard = build_base_shard(model_name, inference_class)
   if not shard:
     print(f"Error: Unsupported model '{model_name}' for inference engine {inference_engine.__class__.__name__}")
     return
-  tokenizer = await resolve_tokenizer(shard.model_id)
+  tokenizer = await resolve_tokenizer(get_repo(shard.model_id, inference_class))
   request_id = str(uuid.uuid4())
   callback_id = f"cli-wait-response-{request_id}"
   callback = node.on_token.register(callback_id)
@@ -173,7 +182,7 @@ async def run_model_cli(node: Node, inference_engine: InferenceEngine, model_nam
 
   try:
     print(f"Processing prompt: {prompt}")
-    await node.process_prompt(shard, prompt, None, request_id=request_id)
+    await node.process_prompt(shard, prompt, request_id=request_id)
 
     _, tokens, _ = await callback.wait(lambda _request_id, tokens, is_finished: _request_id == request_id and is_finished, timeout=300)
 
@@ -189,12 +198,38 @@ async def run_model_cli(node: Node, inference_engine: InferenceEngine, model_nam
 async def main():
   loop = asyncio.get_running_loop()
 
+  # Check HuggingFace directory permissions
+  hf_home, has_read, has_write = get_hf_home(), await has_hf_home_read_access(), await has_hf_home_write_access()
+  if DEBUG >= 1: print(f"Model storage directory: {hf_home}")
+  print(f"{has_read=}, {has_write=}")
+  if not has_read or not has_write:
+    print(f"""
+          WARNING: Limited permissions for model storage directory: {hf_home}.
+          This may prevent model downloads from working correctly.
+          {"❌ No read access" if not has_read else ""}
+          {"❌ No write access" if not has_write else ""}
+          """)
+    
+  if args.models_seed_dir is not None:
+    try:
+      await move_models_to_hf(args.models_seed_dir)
+    except Exception as e:
+      print(f"Error moving models to .cache/huggingface: {e}")
+
+  def restore_cursor():
+    if platform.system() != "Windows":
+        os.system("tput cnorm")  # Show cursor
+
+  # Restore the cursor when the program exits
+  atexit.register(restore_cursor)
+
   # Use a more direct approach to handle signals
   def handle_exit():
-    asyncio.ensure_future(shutdown(signal.SIGTERM, loop))
+    asyncio.ensure_future(shutdown(signal.SIGTERM, loop, node.server))
 
-  for s in [signal.SIGINT, signal.SIGTERM]:
-    loop.add_signal_handler(s, handle_exit)
+  if platform.system() != "Windows":
+    for s in [signal.SIGINT, signal.SIGTERM]:
+      loop.add_signal_handler(s, handle_exit)
 
   await node.start(wait_for_peers=args.wait_for_peers)
 
@@ -217,8 +252,9 @@ def run():
   except KeyboardInterrupt:
     print("Received keyboard interrupt. Shutting down...")
   finally:
-    loop.run_until_complete(shutdown(signal.SIGTERM, loop))
+    loop.run_until_complete(shutdown(signal.SIGTERM, loop, node.server))
     loop.close()
 
+
 if __name__ == "__main__":
   run()

+ 125 - 50
exo/models.py

@@ -1,73 +1,148 @@
 from exo.inference.shard import Shard
+from typing import Optional, List
 
-model_base_shards = {
+model_cards = {
   ### llama
   "llama-3.2-1b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Llama-3.2-1B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=16),
+    "layers": 16,
+    "repo": {
+      "MLXDynamicShardInferenceEngine": "mlx-community/Llama-3.2-1B-Instruct-4bit",
+      "TinygradDynamicShardInferenceEngine": "unsloth/Llama-3.2-1B-Instruct",
+    },
   },
   "llama-3.2-3b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Llama-3.2-3B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=28),
+    "layers": 28,
+    "repo": {
+       "MLXDynamicShardInferenceEngine": "mlx-community/Llama-3.2-3B-Instruct-4bit",
+       "TinygradDynamicShardInferenceEngine": "unsloth/Llama-3.2-3B-Instruct",
+    },
   },
   "llama-3.1-8b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
-    "TinygradDynamicShardInferenceEngine": Shard(model_id="mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated", start_layer=0, end_layer=0, n_layers=32),
+    "layers": 32,
+    "repo": {
+       "MLXDynamicShardInferenceEngine": "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
+       "TinygradDynamicShardInferenceEngine": "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated",
+    },
   },
   "llama-3.1-70b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
-    "TinygradDynamicShardInferenceEngine": Shard(model_id="NousResearch/Meta-Llama-3.1-70B-Instruct", start_layer=0, end_layer=0, n_layers=80),
+    "layers": 80,
+    "repo": {
+       "MLXDynamicShardInferenceEngine": "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit",
+       "TinygradDynamicShardInferenceEngine": "NousResearch/Meta-Llama-3.1-70B-Instruct",
+    },
   },
   "llama-3.1-70b-bf16": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-70B-Instruct-bf16-CORRECTED", start_layer=0, end_layer=0, n_layers=80),
-    "TinygradDynamicShardInferenceEngine": Shard(model_id="NousResearch/Meta-Llama-3.1-70B-Instruct", start_layer=0, end_layer=0, n_layers=80),
+    "layers": 80,
+    "repo": {
+       "MLXDynamicShardInferenceEngine": "mlx-community/Meta-Llama-3.1-70B-Instruct-bf16-CORRECTED",
+       "TinygradDynamicShardInferenceEngine": "NousResearch/Meta-Llama-3.1-70B-Instruct",
+    },
   },
-  "llama-3.1-405b": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3.1-405B-4bit", start_layer=0, end_layer=0, n_layers=126),},
   "llama-3-8b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
-    "TinygradDynamicShardInferenceEngine": Shard(model_id="TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R", start_layer=0, end_layer=0, n_layers=32),
+    "layers": 32,
+    "repo": {
+       "MLXDynamicShardInferenceEngine": "mlx-community/Meta-Llama-3-8B-Instruct-4bit",
+       "TinygradDynamicShardInferenceEngine": "TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-8B-R",
+    },
   },
   "llama-3-70b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
-    "TinygradDynamicShardInferenceEngine": Shard(model_id="TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R", start_layer=0, end_layer=0, n_layers=80),
-  },
+    "layers": 80,
+    "repo": {
+       "MLXDynamicShardInferenceEngine": "mlx-community/Meta-Llama-3-70B-Instruct-4bit",
+       "TinygradDynamicShardInferenceEngine": "TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R",
+    },
+  },
+  "llama-3.1-405b": { "layers": 126, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Meta-Llama-3.1-405B-4bit", }, },
+  "llama-3.1-405b-8bit": { "layers": 126, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Meta-Llama-3.1-405B-Instruct-8bit", }, },
   ### mistral
-  "mistral-nemo": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Mistral-Nemo-Instruct-2407-4bit", start_layer=0, end_layer=0, n_layers=40),},
-  "mistral-large": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Mistral-Large-Instruct-2407-4bit", start_layer=0, end_layer=0, n_layers=88),},
+  "mistral-nemo": { "layers": 40, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Mistral-Nemo-Instruct-2407-4bit", }, },
+  "mistral-large": { "layers": 88, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Mistral-Large-Instruct-2407-4bit", }, },
   ### deepseek
-  "deepseek-coder-v2-lite": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", start_layer=0, end_layer=0, n_layers=27),},
-  "deepseek-coder-v2.5": {"MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/DeepSeek-V2.5-MLX-AQ4_1_64", start_layer=0, end_layer=0, n_layers=60),},
+  "deepseek-coder-v2-lite": { "layers": 27, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", }, },
+  "deepseek-coder-v2.5": { "layers": 60, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/DeepSeek-V2.5-MLX-AQ4_1_64", }, },
   ### llava
-  "llava-1.5-7b-hf": {"MLXDynamicShardInferenceEngine": Shard(model_id="llava-hf/llava-1.5-7b-hf", start_layer=0, end_layer=0, n_layers=32),},
+  "llava-1.5-7b-hf": { "layers": 32, "repo": { "MLXDynamicShardInferenceEngine": "llava-hf/llava-1.5-7b-hf", }, },
   ### qwen
-  "qwen-2.5-coder-1.5b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Qwen2.5-Coder-1.5B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=28),
-  },
-  "qwen-2.5-coder-7b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Qwen2.5-Coder-7B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=28),
-  },
-  "qwen-2.5-7b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Qwen2.5-7B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=28),
-  },
-  "qwen-2.5-math-7b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Qwen2.5-Math-7B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=28),
-  },
-  "qwen-2.5-14b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Qwen2.5-14B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=48),
-  },
-  "qwen-2.5-72b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Qwen2.5-72B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
-  },
-  "qwen-2.5-math-72b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Qwen2.5-Math-72B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
-  },
+  "qwen-2.5-0.5b": { "layers": 28, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-0.5B-Instruct-4bit", }, },
+  "qwen-2.5-coder-1.5b": { "layers": 28, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-Coder-1.5B-Instruct-4bit", }, },
+  "qwen-2.5-coder-3b": { "layers": 36, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-Coder-3B-Instruct-4bit", }, },
+  "qwen-2.5-coder-7b": { "layers": 28, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-Coder-7B-Instruct-4bit", }, },
+  "qwen-2.5-coder-14b": { "layers": 48, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-Coder-14B-Instruct-4bit", }, },
+  "qwen-2.5-coder-32b": { "layers": 64, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-Coder-32B-Instruct-4bit", }, },
+  "qwen-2.5-7b": { "layers": 28, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-7B-Instruct-4bit", }, },
+  "qwen-2.5-math-7b": { "layers": 28, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-Math-7B-Instruct-4bit", }, },
+  "qwen-2.5-14b": { "layers": 48, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-14B-Instruct-4bit", }, },
+  "qwen-2.5-72b": { "layers": 80, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-72B-Instruct-4bit", }, },
+  "qwen-2.5-math-72b": { "layers": 80, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Qwen2.5-Math-72B-Instruct-4bit", }, },
   ### nemotron
-  "nemotron-70b": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/nvidia_Llama-3.1-Nemotron-70B-Instruct-HF_4bit", start_layer=0, end_layer=0, n_layers=80),
-  },
-  "nemotron-70b-bf16": {
-    "MLXDynamicShardInferenceEngine": Shard(model_id="mlx-community/Llama-3.1-Nemotron-70B-Instruct-HF-bf16", start_layer=0, end_layer=0, n_layers=80),
-  },
+  "nemotron-70b": { "layers": 80, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/nvidia_Llama-3.1-Nemotron-70B-Instruct-HF_4bit", }, },
+  "nemotron-70b-bf16": { "layers": 80, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/Llama-3.1-Nemotron-70B-Instruct-HF-bf16", }, },
+  # gemma
+  "gemma2-9b": { "layers": 42, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/gemma-2-9b-it-4bit", }, },
+  "gemma2-27b": { "layers": 46, "repo": { "MLXDynamicShardInferenceEngine": "mlx-community/gemma-2-27b-it-4bit", }, },
   # dummy
-  "dummy": {
-    "DummyInferenceEngine": Shard(model_id="dummy", start_layer=0, end_layer=7, n_layers=8),
-  },
+  "dummy": { "layers": 8, "repo": { "DummyInferenceEngine": "dummy", }, },
+}
+
+pretty_name = {
+  "llama-3.2-1b": "Llama 3.2 1B",
+  "llama-3.2-3b": "Llama 3.2 3B",
+  "llama-3.1-8b": "Llama 3.1 8B",
+  "llama-3.1-70b": "Llama 3.1 70B",
+  "llama-3.1-70b-bf16": "Llama 3.1 70B (BF16)",
+  "llama-3.1-405b": "Llama 3.1 405B",
+  "llama-3.1-405b-8bit": "Llama 3.1 405B (8-bit)",
+  "gemma2-9b": "Gemma2 9B",
+  "gemma2-27b": "Gemma2 27B",
+  "nemotron-70b": "Nemotron 70B",
+  "nemotron-70b-bf16": "Nemotron 70B (BF16)",
+  "mistral-nemo": "Mistral Nemo",
+  "mistral-large": "Mistral Large",
+  "deepseek-coder-v2-lite": "Deepseek Coder V2 Lite",
+  "deepseek-coder-v2.5": "Deepseek Coder V2.5",
+  "llava-1.5-7b-hf": "LLaVa 1.5 7B (Vision Model)",
+  "qwen-2.5-coder-1.5b": "Qwen 2.5 Coder 1.5B",
+  "qwen-2.5-coder-3b": "Qwen 2.5 Coder 3B",
+  "qwen-2.5-coder-7b": "Qwen 2.5 Coder 7B",
+  "qwen-2.5-coder-14b": "Qwen 2.5 Coder 14B",
+  "qwen-2.5-coder-32b": "Qwen 2.5 Coder 32B",
+  "qwen-2.5-7b": "Qwen 2.5 7B",
+  "qwen-2.5-math-7b": "Qwen 2.5 7B (Math)",
+  "qwen-2.5-14b": "Qwen 2.5 14B",
+  "qwen-2.5-72b": "Qwen 2.5 72B",
+  "qwen-2.5-math-72b": "Qwen 2.5 72B (Math)",
+  "llama-3-8b": "Llama 3 8B",
+  "llama-3-70b": "Llama 3 70B",
 }
+
+def get_repo(model_id: str, inference_engine_classname: str) -> Optional[str]:
+  return model_cards.get(model_id, {}).get("repo", {}).get(inference_engine_classname, None)
+
+def build_base_shard(model_id: str, inference_engine_classname: str) -> Optional[Shard]:
+  repo = get_repo(model_id, inference_engine_classname)
+  n_layers = model_cards.get(model_id, {}).get("layers", 0)
+  if repo is None or n_layers < 1:
+    return None
+  return Shard(model_id, 0, 0, n_layers)
+
+def get_supported_models(supported_inference_engine_lists: List[List[str]]) -> List[str]:
+  if not supported_inference_engine_lists:
+    return list(model_cards.keys())
+
+  from exo.inference.inference_engine import inference_engine_classes
+  supported_inference_engine_lists = [
+    [inference_engine_classes[engine] if engine in inference_engine_classes else engine for engine in engine_list]
+    for engine_list in supported_inference_engine_lists
+  ]
+
+  def has_any_engine(model_info: dict, engine_list: List[str]) -> bool:
+    return any(engine in model_info.get("repo", {}) for engine in engine_list)
+
+  def supports_all_engine_lists(model_info: dict) -> bool:
+    return all(has_any_engine(model_info, engine_list)
+              for engine_list in supported_inference_engine_lists)
+
+  return [
+    model_id for model_id, model_info in model_cards.items()
+    if supports_all_engine_lists(model_info)
+  ]
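
To see how the reshaped model registry fits together, here is a short usage sketch of the new helpers; the return values shown are read straight off the model_cards table above:

from exo.models import build_base_shard, get_repo, get_supported_models

# Resolve the HF repo that backs a model for a given inference engine class.
get_repo("llama-3.2-1b", "MLXDynamicShardInferenceEngine")
# -> "mlx-community/Llama-3.2-1B-Instruct-4bit"

# Build the unsplit base shard; the partitioning strategy later maps it across nodes.
build_base_shard("llama-3.2-1b", "MLXDynamicShardInferenceEngine")
# -> Shard(model_id="llama-3.2-1b", start_layer=0, end_layer=0, n_layers=16)

# Keep only models every node can serve: each inner list is the engine set reported
# by one node, and a model must have a repo for at least one engine in every list.
get_supported_models([["MLXDynamicShardInferenceEngine"], ["TinygradDynamicShardInferenceEngine"]])
# -> ["llama-3.2-1b", "llama-3.2-3b", "llama-3.1-8b", ...]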

+ 11 - 8
exo/networking/grpc/grpc_peer_handle.py

@@ -9,7 +9,7 @@ from . import node_service_pb2_grpc
 from ..peer_handle import PeerHandle
 from exo.inference.shard import Shard
 from exo.topology.topology import Topology
-from exo.topology.device_capabilities import DeviceCapabilities
+from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
 from exo.helpers import DEBUG
 
 
@@ -32,7 +32,11 @@ class GRPCPeerHandle(PeerHandle):
 
   async def connect(self):
     if self.channel is None:
-      self.channel = grpc.aio.insecure_channel(self.address, options=[("grpc.max_metadata_size", 32*1024*1024)])
+      self.channel = grpc.aio.insecure_channel(self.address, options=[
+        ("grpc.max_metadata_size", 32*1024*1024),
+        ('grpc.max_receive_message_length', 32*1024*1024),
+        ('grpc.max_send_message_length', 32*1024*1024)
+      ])
       self.stub = node_service_pb2_grpc.NodeServiceStub(self.channel)
     await self.channel.channel_ready()
 
@@ -63,10 +67,9 @@ class GRPCPeerHandle(PeerHandle):
         traceback.print_exc()
       return False
 
-  async def send_prompt(self, shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
+  async def send_prompt(self, shard: Shard, prompt: str, request_id: Optional[str] = None) -> Optional[np.array]:
     request = node_service_pb2.PromptRequest(
       prompt=prompt,
-      image_str=image_str,
       shard=node_service_pb2.Shard(
         model_id=shard.model_id,
         start_layer=shard.start_layer,
@@ -74,7 +77,6 @@ class GRPCPeerHandle(PeerHandle):
         n_layers=shard.n_layers,
       ),
       request_id=request_id,
-      inference_state=inference_state,
     )
     response = await self.stub.SendPrompt(request)
 
@@ -83,7 +85,7 @@ class GRPCPeerHandle(PeerHandle):
 
     return np.frombuffer(response.tensor_data, dtype=np.dtype(response.dtype)).reshape(response.shape)
 
-  async def send_tensor(self, shard: Shard, tensor: np.ndarray, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
+  async def send_tensor(self, shard: Shard, tensor: np.ndarray, request_id: Optional[str] = None) -> Optional[np.array]:
     request = node_service_pb2.TensorRequest(
       shard=node_service_pb2.Shard(
         model_id=shard.model_id,
@@ -93,7 +95,6 @@ class GRPCPeerHandle(PeerHandle):
       ),
       tensor=node_service_pb2.Tensor(tensor_data=tensor.tobytes(), shape=tensor.shape, dtype=str(tensor.dtype)),
       request_id=request_id,
-      inference_state=inference_state,
     )
     response = await self.stub.SendTensor(request)
 
@@ -117,7 +118,9 @@ class GRPCPeerHandle(PeerHandle):
     response = await self.stub.CollectTopology(request)
     topology = Topology()
     for node_id, capabilities in response.nodes.items():
-      device_capabilities = DeviceCapabilities(model=capabilities.model, chip=capabilities.chip, memory=capabilities.memory, flops=capabilities.flops)
+      device_capabilities = DeviceCapabilities(
+        model=capabilities.model, chip=capabilities.chip, memory=capabilities.memory, flops=DeviceFlops(fp16=capabilities.flops.fp16, fp32=capabilities.flops.fp32, int8=capabilities.flops.int8)
+      )
       topology.update_node(node_id, device_capabilities)
     for node_id, peers in response.peer_graph.items():
       for peer_id in peers.peer_ids:
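
The added channel options matter because intermediate activations travel as raw tensor bytes, which can easily exceed gRPC's default 4 MiB receive limit. A minimal sketch of opening a peer channel with the same 32 MiB caps (the address is a made-up example):

import grpc

address = "192.168.1.10:5678"  # hypothetical peer address
channel = grpc.aio.insecure_channel(address, options=[
    ("grpc.max_metadata_size", 32 * 1024 * 1024),
    ("grpc.max_receive_message_length", 32 * 1024 * 1024),
    ("grpc.max_send_message_length", 32 * 1024 * 1024),
])
# stub = node_service_pb2_grpc.NodeServiceStub(channel); await channel.channel_ready()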

+ 3 - 5
exo/networking/grpc/grpc_server.py

@@ -49,10 +49,9 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
       n_layers=request.shard.n_layers,
     )
     prompt = request.prompt
-    image_str = request.image_str
     request_id = request.request_id
-    result = await self.node.process_prompt(shard, prompt, image_str, request_id)
-    if DEBUG >= 5: print(f"SendPrompt {shard=} {prompt=} {image_str=} {request_id=} result: {result}")
+    result = await self.node.process_prompt(shard, prompt, request_id)
+    if DEBUG >= 5: print(f"SendPrompt {shard=} {prompt=} {request_id=} result: {result}")
     tensor_data = result.tobytes() if result is not None else None
     return node_service_pb2.Tensor(tensor_data=tensor_data, shape=result.shape, dtype=str(result.dtype)) if result is not None else node_service_pb2.Tensor()
 
@@ -65,9 +64,8 @@ class GRPCServer(node_service_pb2_grpc.NodeServiceServicer):
     )
     tensor = np.frombuffer(request.tensor.tensor_data, dtype=np.dtype(request.tensor.dtype)).reshape(request.tensor.shape)
     request_id = request.request_id
-    inference_state = request.inference_state
 
-    result = await self.node.process_tensor(shard, tensor, request_id, inference_state)
+    result = await self.node.process_tensor(shard, tensor, request_id)
     if DEBUG >= 5: print(f"SendTensor tensor {shard=} {tensor=} {request_id=} result: {result}")
     tensor_data = result.tobytes() if result is not None else None
     return node_service_pb2.Tensor(tensor_data=tensor_data, shape=result.shape, dtype=str(result.dtype)) if result is not None else node_service_pb2.Tensor()
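
The tensor payload format is worth spelling out: the array is shipped as raw bytes plus its shape and dtype, and rebuilt with numpy on the other side. A small round-trip sketch, assuming the generated node_service_pb2 bindings are importable:

import numpy as np
from exo.networking.grpc import node_service_pb2

arr = np.random.randn(1, 4, 8).astype(np.float32)  # stand-in activation tensor

# Sender side: serialize as raw bytes, carrying shape and dtype alongside.
msg = node_service_pb2.Tensor(tensor_data=arr.tobytes(), shape=arr.shape, dtype=str(arr.dtype))

# Receiver side: rebuild the array exactly as grpc_server.py / grpc_peer_handle.py do.
restored = np.frombuffer(msg.tensor_data, dtype=np.dtype(msg.dtype)).reshape(msg.shape)
assert np.array_equal(arr, restored)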

+ 2 - 5
exo/networking/grpc/node_service.proto

@@ -22,16 +22,13 @@ message Shard {
 message PromptRequest {
   Shard shard = 1;
   string prompt = 2;
-  optional string image_str = 3;
-  optional string request_id = 4;
-  optional string inference_state = 5;
+  optional string request_id = 3;
 }
 
 message TensorRequest {
   Shard shard = 1;
   Tensor tensor = 2;
   optional string request_id = 3;
-  optional string inference_state = 4;
 }
 
 message GetInferenceResultRequest {
@@ -93,4 +90,4 @@ message HealthCheckResponse {
   bool is_healthy = 1;
 }
 
-message Empty {}
+message Empty {}
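
With image_str and inference_state removed, a prompt request now carries only the shard, the prompt, and an optional request_id. A hedged sketch of constructing one from Python (field names taken from the message definitions above; the values are illustrative):

from exo.networking.grpc import node_service_pb2

request = node_service_pb2.PromptRequest(
    prompt="What is the capital of France?",
    shard=node_service_pb2.Shard(model_id="llama-3.2-1b", start_layer=0, end_layer=15, n_layers=16),
    request_id="example-request",  # optional per the proto definition above
)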

File diff suppressed because it is too large
+ 0 - 0
exo/networking/grpc/node_service_pb2.py


+ 9 - 7
exo/networking/manual/manual_discovery.py

@@ -19,7 +19,9 @@ class ManualDiscovery(Discovery):
     self.create_peer_handle = create_peer_handle
 
     if node_id not in self.topology.peers:
-      raise ValueError(f"Node ID {node_id} not found in network config file {network_config_path}. Please run with `node_id` set to one of the keys in the config file: {[k for k, _ in self.topology.peers]}")
+      raise ValueError(
+        f"Node ID {node_id} not found in network config file {network_config_path}. Please run with `node_id` set to one of the keys in the config file: {[k for k, _ in self.topology.peers]}"
+      )
 
     self.listen_task = None
 
@@ -42,7 +44,6 @@ class ManualDiscovery(Discovery):
     if DEBUG_DISCOVERY >= 2: print(f"Discovered peers: {[peer.id() for peer in self.known_peers.values()]}")
     return list(self.known_peers.values())
 
-
   async def task_find_peers_from_config(self):
     if DEBUG_DISCOVERY >= 2: print("Starting task to find peers from config...")
     while True:
@@ -52,18 +53,19 @@ class ManualDiscovery(Discovery):
           peer = self.known_peers.get(peer_id)
           if not peer:
             if DEBUG_DISCOVERY >= 2: print(f"{peer_id=} not found in known peers. Adding.")
-            peer = self.create_peer_handle(peer_id, f"{peer_config.address}:{peer_config.port}", peer_config.device_capabilities)  
+            peer = self.create_peer_handle(peer_id, f"{peer_config.address}:{peer_config.port}", peer_config.device_capabilities)
           is_healthy = await peer.health_check()
           if is_healthy:
             if DEBUG_DISCOVERY >= 2: print(f"{peer_id=} at {peer_config.address}:{peer_config.port} is healthy.")
             self.known_peers[peer_id] = peer
           else:
             if DEBUG_DISCOVERY >= 2: print(f"{peer_id=} at {peer_config.address}:{peer_config.port} is not healthy.")
-            try: del self.known_peers[peer_id]
-            except KeyError: pass
+            try:
+              del self.known_peers[peer_id]
+            except KeyError:
+              pass
         except Exception as e:
-            if DEBUG_DISCOVERY >= 2: print(f"Exception occured when attempting to add {peer_id=}: {e}")
+          if DEBUG_DISCOVERY >= 2: print(f"Exception occurred when attempting to add {peer_id=}: {e}")
       await asyncio.sleep(1.0)
 
       if DEBUG_DISCOVERY >= 2: print(f"Current known peers: {[peer.id() for peer in self.known_peers.values()]}")
-

+ 0 - 1
exo/networking/manual/network_topology_config.py

@@ -17,7 +17,6 @@ class NetworkTopology(BaseModel):
   """
   node_id to PeerConfig. The node_id is used to identify the peer in the discovery process. The node that this is running from should be included in this dict.
   """
-
   @classmethod
   def from_path(cls, path: str) -> "NetworkTopology":
     try:

+ 3 - 2
exo/networking/peer_handle.py

@@ -5,6 +5,7 @@ from exo.inference.shard import Shard
 from exo.topology.device_capabilities import DeviceCapabilities
 from exo.topology.topology import Topology
 
+
 class PeerHandle(ABC):
   @abstractmethod
   def id(self) -> str:
@@ -35,11 +36,11 @@ class PeerHandle(ABC):
     pass
 
   @abstractmethod
-  async def send_prompt(self, shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
+  async def send_prompt(self, shard: Shard, prompt: str, request_id: Optional[str] = None) -> Optional[np.array]:
     pass
 
   @abstractmethod
-  async def send_tensor(self, shard: Shard, tensor: np.array, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.array]:
+  async def send_tensor(self, shard: Shard, tensor: np.array, request_id: Optional[str] = None) -> Optional[np.array]:
     pass
 
   @abstractmethod

+ 11 - 11
exo/networking/tailscale/tailscale_discovery.py

@@ -8,6 +8,7 @@ from exo.topology.device_capabilities import DeviceCapabilities, device_capabili
 from exo.helpers import DEBUG, DEBUG_DISCOVERY
 from .tailscale_helpers import get_device_id, update_device_attributes, get_device_attributes, get_tailscale_devices, Device
 
+
 class TailscaleDiscovery(Discovery):
   def __init__(
     self,
@@ -69,14 +70,11 @@ class TailscaleDiscovery(Discovery):
         devices: dict[str, Device] = await get_tailscale_devices(self.tailscale_api_key, self.tailnet)
         current_time = time.time()
 
-        active_devices = {
-          name: device for name, device in devices.items()
-          if device.last_seen is not None and (current_time - device.last_seen.timestamp()) < 30
-        }
+        active_devices = {name: device for name, device in devices.items() if device.last_seen is not None and (current_time - device.last_seen.timestamp()) < 30}
 
         if DEBUG_DISCOVERY >= 4: print(f"Found tailscale devices: {devices}")
         if DEBUG_DISCOVERY >= 2: print(f"Active tailscale devices: {len(active_devices)}/{len(devices)}")
-        if DEBUG_DISCOVERY >= 2: print("Time since last seen tailscale devices", [(current_time  - device.last_seen.timestamp()) for device in devices.values()])
+        if DEBUG_DISCOVERY >= 2: print("Time since last seen tailscale devices", [(current_time - device.last_seen.timestamp()) for device in devices.values()])
 
         for device in active_devices.values():
           if device.name == self.node_id: continue
@@ -141,7 +139,13 @@ class TailscaleDiscovery(Discovery):
         for peer_id, should_remove in zip(peer_ids, results):
           if should_remove: peers_to_remove.append(peer_id)
 
-        if DEBUG_DISCOVERY >= 2: print("Peer statuses:", { peer_handle.id(): f"is_connected={await peer_handle.is_connected()}, health_check={await peer_handle.health_check()}, connected_at={connected_at}, last_seen={last_seen}" for peer_handle, connected_at, last_seen in self.known_peers.values() })
+        if DEBUG_DISCOVERY >= 2:
+          print(
+            "Peer statuses:", {
+              peer_handle.id(): f"is_connected={await peer_handle.is_connected()}, health_check={await peer_handle.health_check()}, connected_at={connected_at}, last_seen={last_seen}"
+              for peer_handle, connected_at, last_seen in self.known_peers.values()
+            }
+          )
 
         for peer_id in peers_to_remove:
           if peer_id in self.known_peers:
@@ -164,9 +168,5 @@ class TailscaleDiscovery(Discovery):
       if DEBUG_DISCOVERY >= 2: print(f"Error checking peer {peer_id}: {e}")
       return True
 
-    should_remove = (
-      (not is_connected and current_time - connected_at > self.discovery_timeout) or
-      (current_time - last_seen > self.discovery_timeout) or
-      (not health_ok)
-    )
+    should_remove = ((not is_connected and current_time - connected_at > self.discovery_timeout) or (current_time - last_seen > self.discovery_timeout) or (not health_ok))
     return should_remove

+ 15 - 26
exo/networking/tailscale/tailscale_helpers.py

@@ -7,6 +7,7 @@ from exo.helpers import DEBUG_DISCOVERY
 from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
 from datetime import datetime, timezone
 
+
 class Device:
   def __init__(self, device_id: str, name: str, addresses: List[str], last_seen: Optional[datetime] = None):
     self.device_id = device_id
@@ -16,12 +17,7 @@ class Device:
 
   @classmethod
   def from_dict(cls, data: Dict[str, Any]) -> 'Device':
-    return cls(
-      device_id=data.get('id', ''),
-      name=data.get('name', ''),
-      addresses=data.get('addresses', []),
-      last_seen=cls.parse_datetime(data.get('lastSeen'))
-    )
+    return cls(device_id=data.get('id', ''), name=data.get('name', ''), addresses=data.get('addresses', []), last_seen=cls.parse_datetime(data.get('lastSeen')))
 
   @staticmethod
   def parse_datetime(date_string: Optional[str]) -> Optional[datetime]:
@@ -29,13 +25,10 @@ class Device:
       return None
     return datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
 
+
 async def get_device_id() -> str:
   try:
-    process = await asyncio.create_subprocess_exec(
-      'tailscale', 'status', '--json',
-      stdout=asyncio.subprocess.PIPE,
-      stderr=asyncio.subprocess.PIPE
-    )
+    process = await asyncio.create_subprocess_exec('tailscale', 'status', '--json', stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
     stdout, stderr = await process.communicate()
     if process.returncode != 0:
       raise Exception(f"Command failed with exit code {process.returncode}: {stderr.decode().strip()}.")
@@ -45,22 +38,16 @@ async def get_device_id() -> str:
   except Exception as e:
     raise Exception(f"{str(e)} Do you have the tailscale cli installed? See: https://tailscale.com/kb/1080/cli")
 
+
 async def update_device_attributes(device_id: str, api_key: str, node_id: str, node_port: int, device_capabilities: DeviceCapabilities):
   async with aiohttp.ClientSession() as session:
     base_url = f"https://api.tailscale.com/api/v2/device/{device_id}/attributes"
-    headers = {
-      'Authorization': f'Bearer {api_key}',
-      'Content-Type': 'application/json'
-    }
+    headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
 
     attributes = {
-      "custom:exo_node_id": node_id.replace('-', '_'),
-      "custom:exo_node_port": node_port,
-      "custom:exo_device_capability_chip": sanitize_attribute(device_capabilities.chip),
-      "custom:exo_device_capability_model": sanitize_attribute(device_capabilities.model),
-      "custom:exo_device_capability_memory": str(device_capabilities.memory),
-      "custom:exo_device_capability_flops_fp16": str(device_capabilities.flops.fp16),
-      "custom:exo_device_capability_flops_fp32": str(device_capabilities.flops.fp32),
+      "custom:exo_node_id": node_id.replace('-', '_'), "custom:exo_node_port": node_port, "custom:exo_device_capability_chip": sanitize_attribute(device_capabilities.chip),
+      "custom:exo_device_capability_model": sanitize_attribute(device_capabilities.model), "custom:exo_device_capability_memory": str(device_capabilities.memory),
+      "custom:exo_device_capability_flops_fp16": str(device_capabilities.flops.fp16), "custom:exo_device_capability_flops_fp32": str(device_capabilities.flops.fp32),
       "custom:exo_device_capability_flops_int8": str(device_capabilities.flops.int8)
     }
 
@@ -73,12 +60,11 @@ async def update_device_attributes(device_id: str, api_key: str, node_id: str, n
         else:
           print(f"Failed to update device posture attribute {attr_name}: {response.status} {await response.text()}")
 
+
 async def get_device_attributes(device_id: str, api_key: str) -> Tuple[str, int, DeviceCapabilities]:
   async with aiohttp.ClientSession() as session:
     url = f"https://api.tailscale.com/api/v2/device/{device_id}/attributes"
-    headers = {
-      'Authorization': f'Bearer {api_key}'
-    }
+    headers = {'Authorization': f'Bearer {api_key}'}
     async with session.get(url, headers=headers) as response:
       if response.status == 200:
         data = await response.json()
@@ -100,6 +86,7 @@ async def get_device_attributes(device_id: str, api_key: str) -> Tuple[str, int,
         print(f"Failed to fetch posture attributes for {device_id}: {response.status}")
         return "", 0, DeviceCapabilities(model="", chip="", memory=0, flops=DeviceFlops(fp16=0, fp32=0, int8=0))
 
+
 def parse_device_attributes(data: Dict[str, str]) -> Dict[str, Any]:
   result = {}
   prefix = "custom:exo_"
@@ -112,12 +99,14 @@ def parse_device_attributes(data: Dict[str, str]) -> Dict[str, Any]:
         result[attr_name] = float(value)
   return result
 
+
 def sanitize_attribute(value: str) -> str:
   # Replace invalid characters with underscores
   sanitized_value = re.sub(r'[^a-zA-Z0-9_.]', '_', value)
   # Truncate to 50 characters
   return sanitized_value[:50]
 
+
 async def get_tailscale_devices(api_key: str, tailnet: str) -> Dict[str, Device]:
   async with aiohttp.ClientSession() as session:
     url = f"https://api.tailscale.com/api/v2/tailnet/{tailnet}/devices"
@@ -133,4 +122,4 @@ async def get_tailscale_devices(api_key: str, tailnet: str) -> Dict[str, Device]
         device = Device.from_dict(device_data)
         devices[device.name] = device
 
-      return devices
+      return devices
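
Two of these helpers are easy to sanity-check in isolation; a couple of hedged examples (the inputs are made up, the outputs follow from the code shown above):

from exo.networking.tailscale.tailscale_helpers import Device, sanitize_attribute

# Posture attribute values may only contain [a-zA-Z0-9_.] and are capped at 50 chars.
sanitize_attribute("Apple M3 Max (40-core GPU)")
# -> "Apple_M3_Max__40_core_GPU_"

# Tailscale's lastSeen timestamps are parsed as UTC.
Device.parse_datetime("2024-11-01T12:30:00Z")
# -> datetime.datetime(2024, 11, 1, 12, 30, tzinfo=datetime.timezone.utc)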

+ 2 - 0
exo/networking/tailscale/test_tailscale_discovery.py

@@ -5,6 +5,7 @@ from unittest import mock
 from exo.networking.tailscale.tailscale_discovery import TailscaleDiscovery
 from exo.networking.peer_handle import PeerHandle
 
+
 class TestTailscaleDiscovery(unittest.IsolatedAsyncioTestCase):
   async def asyncSetUp(self):
     self.tailscale_api_key = os.environ.get("TAILSCALE_API_KEY", "")
@@ -37,5 +38,6 @@ class TestTailscaleDiscovery(unittest.IsolatedAsyncioTestCase):
     # Check if discovered peers are instances of GRPCPeerHandle
     print(peers)
 
+
 if __name__ == '__main__':
   unittest.main()

+ 14 - 15
exo/networking/udp/udp_discovery.py

@@ -9,6 +9,7 @@ from exo.networking.peer_handle import PeerHandle
 from exo.topology.device_capabilities import DeviceCapabilities, device_capabilities, UNKNOWN_DEVICE_CAPABILITIES
 from exo.helpers import DEBUG, DEBUG_DISCOVERY, get_all_ip_addresses
 
+
 class ListenProtocol(asyncio.DatagramProtocol):
   def __init__(self, on_message: Callable[[bytes, Tuple[str, int]], Coroutine]):
     super().__init__()
@@ -90,17 +91,13 @@ class UDPDiscovery(Discovery):
           "node_id": self.node_id,
           "grpc_port": self.node_port,
           "device_capabilities": self.device_capabilities.to_dict(),
-          "priority": 1, # For now, every interface has the same priority. We can make this better by prioriting interfaces based on bandwidth, latency, and jitter e.g. prioritise Thunderbolt over WiFi.
+          "priority": 1,  # For now, every interface has the same priority. We can make this better by prioriting interfaces based on bandwidth, latency, and jitter e.g. prioritise Thunderbolt over WiFi.
         })
         if DEBUG_DISCOVERY >= 3: print(f"Broadcasting presence at ({addr}): {message}")
 
         transport = None
         try:
-          transport, _ = await asyncio.get_event_loop().create_datagram_endpoint(
-            lambda: BroadcastProtocol(message, self.broadcast_port),
-            local_addr=(addr, 0),
-            family=socket.AF_INET
-          )
+          transport, _ = await asyncio.get_event_loop().create_datagram_endpoint(lambda: BroadcastProtocol(message, self.broadcast_port), local_addr=(addr, 0), family=socket.AF_INET)
           if DEBUG_DISCOVERY >= 3:
             print(f"Broadcasting presence at ({addr})")
         except Exception as e:
@@ -145,7 +142,8 @@ class UDPDiscovery(Discovery):
         if peer_id in self.known_peers:
           existing_peer_prio = self.known_peers[peer_id][3]
           if existing_peer_prio >= peer_prio:
-            if DEBUG >= 1: print(f"Ignoring peer {peer_id} at {peer_host}:{peer_port} with priority {peer_prio} because we already know about a peer with higher or equal priority: {existing_peer_prio}")
+            if DEBUG >= 1:
+              print(f"Ignoring peer {peer_id} at {peer_host}:{peer_port} with priority {peer_prio} because we already know about a peer with higher or equal priority: {existing_peer_prio}")
             return
         new_peer_handle = self.create_peer_handle(peer_id, f"{peer_host}:{peer_port}", device_capabilities)
         if not await new_peer_handle.health_check():
@@ -161,8 +159,7 @@ class UDPDiscovery(Discovery):
         if peer_id in self.known_peers: self.known_peers[peer_id] = (self.known_peers[peer_id][0], self.known_peers[peer_id][1], time.time(), peer_prio)
 
   async def task_listen_for_peers(self):
-    await asyncio.get_event_loop().create_datagram_endpoint(lambda: ListenProtocol(self.on_listen_message),
-                                                            local_addr=("0.0.0.0", self.listen_port))
+    await asyncio.get_event_loop().create_datagram_endpoint(lambda: ListenProtocol(self.on_listen_message), local_addr=("0.0.0.0", self.listen_port))
     if DEBUG_DISCOVERY >= 2: print("Started listen task")
 
   async def task_cleanup_peers(self):
@@ -177,7 +174,13 @@ class UDPDiscovery(Discovery):
         for peer_id, should_remove in zip(peer_ids, results):
           if should_remove: peers_to_remove.append(peer_id)
 
-        if DEBUG_DISCOVERY >= 2: print("Peer statuses:", { peer_handle.id(): f"is_connected={await peer_handle.is_connected()}, health_check={await peer_handle.health_check()}, connected_at={connected_at}, last_seen={last_seen}, prio={prio}" for peer_handle, connected_at, last_seen, prio in self.known_peers.values() })
+        if DEBUG_DISCOVERY >= 2:
+          print(
+            "Peer statuses:", {
+              peer_handle.id(): f"is_connected={await peer_handle.is_connected()}, health_check={await peer_handle.health_check()}, connected_at={connected_at}, last_seen={last_seen}, prio={prio}"
+              for peer_handle, connected_at, last_seen, prio in self.known_peers.values()
+            }
+          )
 
         for peer_id in peers_to_remove:
           if peer_id in self.known_peers:
@@ -200,9 +203,5 @@ class UDPDiscovery(Discovery):
       if DEBUG_DISCOVERY >= 2: print(f"Error checking peer {peer_id}: {e}")
       return True
 
-    should_remove = (
-      (not is_connected and current_time - connected_at > self.discovery_timeout) or
-      (current_time - last_seen > self.discovery_timeout) or
-      (not health_ok)
-    )
+    should_remove = ((not is_connected and current_time - connected_at > self.discovery_timeout) or (current_time - last_seen > self.discovery_timeout) or (not health_ok))
     return should_remove

+ 2 - 2
exo/orchestration/node.py

@@ -16,11 +16,11 @@ class Node(ABC):
     pass
 
   @abstractmethod
-  async def process_prompt(self, shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
+  async def process_prompt(self, shard: Shard, prompt: str, request_id: Optional[str] = None) -> Optional[np.ndarray]:
     pass
 
   @abstractmethod
-  async def process_tensor(self, shard: Shard, tensor: np.ndarray, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
+  async def process_tensor(self, shard: Shard, tensor: np.ndarray, request_id: Optional[str] = None) -> Optional[np.ndarray]:
     pass
 
   @abstractmethod

+ 112 - 122
exo/orchestration/standard_node.py

@@ -39,6 +39,8 @@ class StandardNode(Node):
     self.topology: Topology = Topology()
     self.device_capabilities = device_capabilities()
     self.buffered_token_output: Dict[str, Tuple[List[int], bool]] = {}
+    self.buffered_logits: Dict[str, List[np.ndarray]] = {}
+    self.buffered_inputs: Dict[str, List[np.ndarray]] = {}
     self.max_generate_tokens = max_generate_tokens
     self.topology_viz = topology_viz
     self._on_token = AsyncCallbackSystem[str, Tuple[str, List[int], bool]]()
@@ -87,24 +89,53 @@ class StandardNode(Node):
   def get_supported_inference_engines(self):
     supported_engine_names = []
     if self.inference_engine.__class__.__name__ == 'MLXDynamicShardInferenceEngine':
-        supported_engine_names.append('mlx')
-        supported_engine_names.append('tinygrad')
+      supported_engine_names.append('mlx')
+      supported_engine_names.append('tinygrad')
     else:
-        supported_engine_names.append('tinygrad')
+      supported_engine_names.append('tinygrad')
     return supported_engine_names
 
   async def broadcast_supported_engines(self, supported_engines_names: List[str]):
-    status_message = json.dumps({
-        "type": "supported_inference_engines",
-        "node_id": self.id,
-        "engines": supported_engines_names
-    })
+    status_message = json.dumps({"type": "supported_inference_engines", "node_id": self.id, "engines": supported_engines_names})
     await self.broadcast_opaque_status("", status_message)
 
   def get_topology_inference_engines(self) -> List[List[str]]:
     return self.topology_inference_engines_pool
+  
+  async def process_inference_result(
+    self,
+    shard,
+    result: np.ndarray,
+    request_id: Optional[str] = None,
+  ):
+    if request_id not in self.buffered_token_output:
+      self.buffered_token_output[request_id] = ([], False)
+    is_finished = len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
+    if shard.is_last_layer() and not is_finished:
+      token = await self.inference_engine.sample(result)
+      await self.inference_engine.ensure_shard(shard)
+      self.buffered_token_output[request_id][0].append(token.item())
+      if DEBUG >= 2: print(f"[{request_id}] result size: {result.size}, is finished: {is_finished}, buffered tokens: {len(self.buffered_token_output[request_id][0])}")
+      is_finished = token.item() == self.inference_engine.tokenizer.eos_token_id
+      forward = token.reshape(1, -1)
+      self.trigger_on_token_callbacks(request_id, self.buffered_token_output[request_id][0], is_finished)
+      asyncio.create_task(self.broadcast_result(request_id, self.buffered_token_output[request_id][0], is_finished))
+    else:
+      forward = result
 
-  async def process_prompt(self, base_shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
+    if is_finished:
+      self.buffered_token_output[request_id] = (self.buffered_token_output[request_id][0], True)
+    else:
+      asyncio.create_task(self.forward_tensor(shard, forward, request_id, self.get_partition_index(offset=1)))
+
+    return np.array(self.buffered_token_output[request_id][0])
+
+  async def process_prompt(
+    self,
+    base_shard: Shard,
+    prompt: str,
+    request_id: Optional[str] = None,
+  ) -> Optional[np.ndarray]:
     shard = self.get_current_shard(base_shard)
     asyncio.create_task(
       self.broadcast_opaque_status(
@@ -116,14 +147,12 @@ class StandardNode(Node):
           "base_shard": base_shard.to_dict(),
           "shard": shard.to_dict(),
           "prompt": prompt,
-          "image_str": image_str,
-          "inference_state": inference_state,
           "request_id": request_id,
         }),
       )
     )
     start_time = time.perf_counter_ns()
-    resp = await self._process_prompt(base_shard, prompt, image_str, request_id, inference_state)
+    resp = await self._process_prompt(base_shard, prompt, request_id)
     end_time = time.perf_counter_ns()
     elapsed_time_ns = end_time - start_time
     asyncio.create_task(
@@ -136,8 +165,6 @@ class StandardNode(Node):
           "base_shard": base_shard.to_dict(),
           "shard": shard.to_dict(),
           "prompt": prompt,
-          "image_str": image_str,
-          "inference_state": inference_state,
           "request_id": request_id,
           "elapsed_time_ns": elapsed_time_ns,
           "result_size": resp.size if resp is not None else 0,
@@ -146,42 +173,26 @@ class StandardNode(Node):
     )
     return resp
 
-  async def _process_prompt(self, base_shard: Shard, prompt: str, image_str: Optional[str] = None, request_id: Optional[str] = None, inference_state: Optional[str] = None) -> Optional[np.ndarray]:
+  async def _process_prompt(self, base_shard: Shard, prompt: str, request_id: Optional[str] = None) -> Optional[np.ndarray]:
     if request_id is None:
       request_id = str(uuid.uuid4())
-    if request_id not in self.buffered_token_output:
-      self.buffered_token_output[request_id] = ([], False)
     shard = self.get_current_shard(base_shard)
 
-    if DEBUG >= 2: print(f"[{request_id}] process prompt: {base_shard=} {shard=} {prompt=} {image_str=}")
-    if shard.start_layer != 0:
-      if DEBUG >= 2: print(f"[{request_id}] forwarding to next shard: {base_shard=} {shard=} {prompt=} {image_str=}")
-      await self.forward_to_next_shard(shard, prompt, request_id, image_str=image_str, inference_state=inference_state)
-      return
-
-    result, inference_state, is_finished = await self.inference_engine.infer_prompt(request_id, shard, prompt, image_str, inference_state=inference_state)
-    is_finished = is_finished or len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
-    if is_finished:
-      self.buffered_token_output[request_id] = (self.buffered_token_output[request_id][0], True)
-    asyncio.create_task(self.broadcast_result(request_id, self.buffered_token_output[request_id][0], is_finished))  # TODO: this is n^2 communication complexity
-
-    if result.size == 1:
-      self.buffered_token_output[request_id][0].append(result.item())
-      self.trigger_on_token_callbacks(request_id, self.buffered_token_output[request_id][0], is_finished)
-
-    if DEBUG >= 2: print(f"[{request_id}] result size: {result.size}, is finished: {is_finished}, buffered tokens: {len(self.buffered_token_output[request_id][0])}")
-
-    if not is_finished:
-      asyncio.create_task(self.forward_to_next_shard(shard, result, request_id, image_str=image_str, inference_state=inference_state))
-
-    return np.array(self.buffered_token_output[request_id][0]) if len(self.buffered_token_output[request_id][0]) > 0 else None
+    if DEBUG >= 2: print(f"[{request_id}] process prompt: {base_shard=} {shard=} {prompt=}")
+    if not shard.is_first_layer():
+      if DEBUG >= 2: print(f"[{request_id}] forwarding to next shard: {base_shard=} {shard=} {prompt=}")
+      resp = await self.forward_prompt(shard, prompt, request_id, 0)
+      return None
+    else:
+      result = await self.inference_engine.infer_prompt(request_id, shard, prompt)
+      ret = await self.process_inference_result(shard, result, request_id) 
+      return result
 
   async def process_tensor(
     self,
     base_shard: Shard,
     tensor: np.ndarray,
     request_id: Optional[str] = None,
-    inference_state: Optional[str] = None,
   ) -> Optional[np.ndarray]:
     shard = self.get_current_shard(base_shard)
     asyncio.create_task(
@@ -196,12 +207,11 @@ class StandardNode(Node):
           "tensor_size": tensor.size,
           "tensor_shape": tensor.shape,
           "request_id": request_id,
-          "inference_state": inference_state,
         }),
       )
     )
     start_time = time.perf_counter_ns()
-    resp = await self._process_tensor(shard, tensor, request_id, inference_state)
+    resp = await self._process_tensor(shard, tensor, request_id)
     end_time = time.perf_counter_ns()
     elapsed_time_ns = end_time - start_time
     asyncio.create_task(
@@ -226,84 +236,77 @@ class StandardNode(Node):
     base_shard: Shard,
     tensor: np.ndarray,
     request_id: Optional[str] = None,
-    inference_state: Optional[str] = None,
   ) -> Optional[np.ndarray]:
     if request_id is None:
       request_id = str(uuid.uuid4())
-    if request_id not in self.buffered_token_output:
-      self.buffered_token_output[request_id] = ([], False)
     shard = self.get_current_shard(base_shard)
 
+    if DEBUG >= 1: print(f"[{request_id}] process_tensor: {tensor.size=} {tensor.shape=}")
     try:
-      if DEBUG >= 1: print(f"[{request_id}] process_tensor: {tensor.size=} {tensor.shape=}")
-      result, inference_state, is_finished = await self.inference_engine.infer_tensor(request_id, shard, tensor, inference_state=inference_state)
-      is_finished = is_finished or len(self.buffered_token_output[request_id][0]) >= self.max_generate_tokens
-      if is_finished:
-        self.buffered_token_output[request_id] = (self.buffered_token_output[request_id][0], True)
-      asyncio.create_task(self.broadcast_result(request_id, self.buffered_token_output[request_id][0], is_finished))  # TODO: this is n^2 communication complexity
-
-      if result.size == 1:  # we got a new token out
-        self.buffered_token_output[request_id][0].append(result.item())
-        self.trigger_on_token_callbacks(request_id, self.buffered_token_output[request_id][0], is_finished)
-      if DEBUG >= 2: print(f"[{request_id}] result size: {result.size}, is finished: {is_finished}, buffered tokens: {len(self.buffered_token_output[request_id][0])}")
-
-      if not is_finished:
-        asyncio.create_task(self.forward_to_next_shard(shard, result, request_id, inference_state=inference_state))
-
-      return np.array(self.buffered_token_output[request_id][0]) if len(self.buffered_token_output[request_id][0]) > 0 else None
+      result = await self.inference_engine.infer_tensor(request_id, shard, tensor)
+      ret = await self.process_inference_result(shard, result, request_id) 
+      return ret
     except Exception as e:
       print(f"Error processing tensor for shard {shard}: {e}")
       traceback.print_exc()
       return None
 
-  async def forward_to_next_shard(
+  async def forward_prompt(
     self,
     base_shard: Shard,
-    tensor_or_prompt: Union[np.ndarray, str],
+    prompt: str,
     request_id: str,
-    image_str: Optional[str] = None,
-    inference_state: Optional[str] = None,
+    target_index: int,
   ) -> None:
+    if DEBUG >= 1: print(f"target partition index: {target_index}")
+    target_id = self.partitioning_strategy.partition(self.topology)[target_index].node_id
+    next_shard = self.get_current_shard(base_shard, target_index)
+    if DEBUG >= 2: print(f"Computed target from: {base_shard} {target_index}, {self.topology}. next shard: {next_shard}")
+    if target_id == self.id:
+      await self.process_prompt(next_shard, prompt, request_id)
+    else:
+      target_peer = next((p for p in self.peers if p.id() == target_id), None)
+      if not target_peer:
+        raise ValueError(f"Peer for {target_index} not found")
+      if DEBUG >= 1: print(f"Sending prompt to {target_peer.id()}: {prompt}")
+      await target_peer.send_prompt(next_shard, prompt, request_id=request_id)
+  
+  async def forward_tensor(
+    self,
+    base_shard: Shard,
+    tensor: np.ndarray,
+    request_id: str,
+    target_index: int,
+  ) -> None:
+    if DEBUG >= 1: print(f"target partition index: {target_index}")
+    target_id = self.partitioning_strategy.partition(self.topology)[target_index].node_id
+    next_shard = self.get_current_shard(base_shard, target_index)
+    if DEBUG >= 2: print(f"Computed target from: {base_shard} {target_index}, {self.topology}. target shard: {next_shard}")
+    if target_id == self.id:
+      await self.process_tensor(next_shard, tensor, request_id)
+    else:
+      target_peer = next((p for p in self.peers if p.id() == target_id), None)
+      if not target_peer:
+        raise ValueError(f"Peer for {target_index} not found")
+      if DEBUG >= 1: print(f"Sending tensor to {target_peer.id()}: {tensor}")
+      await target_peer.send_tensor(next_shard, tensor, request_id=request_id)
+
+  def get_partition_index(self, offset: int = 0):
     if not self.partitioning_strategy:
       if DEBUG >= 1: print("No partitioning strategy found. Skipping forward.")
-      return
-    shard = self.get_current_shard(base_shard)
-
+      return None
     partitions = self.partitioning_strategy.partition(self.topology)
-    shards = map_partitions_to_shards(self.partitioning_strategy.partition(self.topology), base_shard.n_layers, base_shard.model_id)
     current_partition_index = next((i for i, p in enumerate(partitions) if p.node_id == self.id), None)
-    if DEBUG >= 1: print(f"Current partition index: {current_partition_index}")
-    if current_partition_index is not None:
-      next_partition_index = (current_partition_index+1) % len(partitions)
-      next_partition: Partition = partitions[next_partition_index]
-      next_shard = shards[next_partition_index]
-      if DEBUG >= 2: print(f"Computed next from: {shard}, {self.topology}. Next partition: {next_partition}")
-
-      if next_partition.node_id == self.id:
-        if isinstance(tensor_or_prompt, np.ndarray):
-          await self.process_tensor(shard, tensor_or_prompt, request_id, inference_state=inference_state)
-        else:
-          await self.process_prompt(shard, tensor_or_prompt, image_str, request_id, inference_state=inference_state)
-        return
-
-      target_peer = next((p for p in self.peers if p.id() == next_partition.node_id), None)
-      if not target_peer:
-        raise ValueError(f"Peer for {next_partition} not found")
-
-      if DEBUG >= 1: print(f"Sending tensor_or_prompt to {target_peer.id()}: {tensor_or_prompt}")
-
-      if isinstance(tensor_or_prompt, np.ndarray):
-        await target_peer.send_tensor(next_shard, tensor_or_prompt, request_id=request_id, inference_state=inference_state)
-      else:
-        await target_peer.send_prompt(next_shard, tensor_or_prompt, image_str=image_str, request_id=request_id, inference_state=inference_state)
+    if current_partition_index is None:
+      raise ValueError(f"No current partition found for node: {self.id}")
+    return (current_partition_index + offset) % len(partitions)
 
-  def get_current_shard(self, base_shard: Shard) -> Shard:
+  def get_current_shard(self, base_shard: Shard, index: Optional[int] = None) -> Shard:
+    if index is None:
+      index = self.get_partition_index()
     partitions = self.partitioning_strategy.partition(self.topology)
     shards = map_partitions_to_shards(partitions, base_shard.n_layers, base_shard.model_id)
-    current_partition_index = next((i for i, p in enumerate(partitions) if p.node_id == self.id), None)
-    if current_partition_index is None:
-      raise ValueError(f"No current partition found for node: {self.id}")
-    return shards[current_partition_index]
+    return shards[index]
 
   async def update_peers(self, wait_for_peers: int = 0) -> bool:
     next_peers = await self.discovery.discover_peers(wait_for_peers)
@@ -311,20 +314,16 @@ class StandardNode(Node):
     next_peer_ids = {peer.id() for peer in next_peers}
     peers_added = [peer for peer in next_peers if peer.id() not in current_peer_ids]
     peers_removed = [peer for peer in self.peers if peer.id() not in next_peer_ids]
-    peers_updated = [
-      peer for peer in next_peers
-      if peer.id() in current_peer_ids and any(p.addr() != peer.addr() for p in self.peers if p.id() == peer.id())
-    ]
-    peers_unchanged = [
-      peer for peer in next_peers
-      if peer.id() in current_peer_ids and all(p.addr() == peer.addr() for p in self.peers if p.id() == peer.id())
-    ]
+    peers_updated = [peer for peer in next_peers if peer.id() in current_peer_ids and any(p.addr() != peer.addr() for p in self.peers if p.id() == peer.id())]
+    peers_unchanged = [peer for peer in next_peers if peer.id() in current_peer_ids and all(p.addr() == peer.addr() for p in self.peers if p.id() == peer.id())]
     peers_to_disconnect = [peer for peer in peers_removed if await peer.is_connected()]
     peers_to_connect = [peer for peer in peers_added + peers_updated + peers_unchanged if not await peer.is_connected()]
 
     def _pretty(peers: List[PeerHandle]) -> List[str]:
       return [f"{peer.id()}@{peer.addr()}" for peer in peers]
-    if DEBUG >= 2: print(f"update_peers: added={peers_added} removed={peers_removed} updated={peers_updated} unchanged={peers_unchanged} to_disconnect={peers_to_disconnect} to_connect={peers_to_connect}")
+
+    if DEBUG >= 2:
+      print(f"update_peers: added={peers_added} removed={peers_removed} updated={peers_updated} unchanged={peers_unchanged} to_disconnect={peers_to_disconnect} to_connect={peers_to_connect}")
 
     async def disconnect_with_timeout(peer, timeout=5):
       try:
@@ -344,14 +343,8 @@ class StandardNode(Node):
         traceback.print_exc()
         return False
 
-    disconnect_results = await asyncio.gather(
-      *(disconnect_with_timeout(peer) for peer in peers_to_disconnect),
-      return_exceptions=True
-    )
-    connect_results = await asyncio.gather(
-      *(connect_with_timeout(peer) for peer in peers_to_connect),
-      return_exceptions=True
-    )
+    disconnect_results = await asyncio.gather(*(disconnect_with_timeout(peer) for peer in peers_to_disconnect), return_exceptions=True)
+    connect_results = await asyncio.gather(*(connect_with_timeout(peer) for peer in peers_to_connect), return_exceptions=True)
 
     successful_disconnects = [peer for peer, result in zip(peers_to_disconnect, disconnect_results) if result is True]
     failed_disconnects = [peer for peer, result in zip(peers_to_disconnect, disconnect_results) if result is False]
@@ -370,12 +363,7 @@ class StandardNode(Node):
     supported_engines = self.get_supported_inference_engines()
     await self.broadcast_supported_engines(supported_engines)
     if len(self.get_topology_inference_engines()):
-      if any(len(engines) == 1 and "tinygrad" in engines for engines in self.get_topology_inference_engines()):
-        if DEBUG >= 1: print("Found node with only tinygrad, using tinygrad on all nodes")
-        self.inference_engine = get_inference_engine("tinygrad", self.shard_downloader)
-      else:
-        if DEBUG >= 1: print("All nodes can use mlx, using mlx for inference")
-        self.inference_engine = get_inference_engine("mlx", self.shard_downloader) 
+      self.inference_engine = get_inference_engine(supported_engines[0], self.shard_downloader)
 
   async def periodic_topology_collection(self, interval: int):
     while True:
@@ -422,6 +410,7 @@ class StandardNode(Node):
         self.topology.merge(other_topology)
       except Exception as e:
         print(f"Error collecting topology from {peer.id()}: {e}")
+        traceback.print_exc()
 
     next_topology.active_node_id = self.topology.active_node_id  # this is not so clean.
     self.topology = next_topology
@@ -440,7 +429,7 @@ class StandardNode(Node):
   def trigger_on_token_callbacks(self, request_id: str, tokens: List[int], is_finished: bool) -> None:
     if DEBUG >= 2: print(f"Triggering all on_token callbacks with {request_id=} num_tokens={len(tokens)} {is_finished=}")
     self.on_token.trigger_all(request_id, tokens, is_finished)
-
+  
   async def broadcast_result(self, request_id: str, result: List[int], is_finished: bool) -> None:
     async def send_result_to_peer(peer):
       try:
@@ -464,6 +453,7 @@ class StandardNode(Node):
       except Exception as e:
         print(f"Error sending opaque status to {peer.id()}: {e}")
         traceback.print_exc()
+
     await asyncio.gather(*[send_status_to_peer(peer) for peer in self.peers], return_exceptions=True)
     # in the case of opaque status, we also want to receive our own opaque statuses
     self.on_opaque_status.trigger_all(request_id, status)
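
The net effect of the refactor above is a ring: each partition runs its layers, then the output hops to partition (current index + offset) mod N via forward_tensor, and sampling happens only on the shard that owns the last layer. A self-contained sketch of just the index arithmetic, with plain dicts standing in for exo's Partition objects:

# Minimal sketch of the get_partition_index(offset=1) hop used by forward_tensor.
def next_partition_index(partitions, node_id, offset=1):
    current = next((i for i, p in enumerate(partitions) if p["node_id"] == node_id), None)
    if current is None:
        raise ValueError(f"No current partition found for node: {node_id}")
    return (current + offset) % len(partitions)

partitions = [{"node_id": "node1"}, {"node_id": "node2"}, {"node_id": "node3"}]
assert next_partition_index(partitions, "node1") == 1  # mid-ring: pass activations downstream
assert next_partition_index(partitions, "node3") == 0  # last node wraps back to the first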

+ 129 - 62
exo/tinychat/index.css

@@ -1,31 +1,11 @@
 /* define colors */
 :root {
-  --primary-color: #a52e4d;
-  --primary-color-transparent: #a52e4d66;
-  --secondary-color: #228039;
-  --secondary-color-transparent: #22803966;
-
+  --primary-color: #fff;
+  --secondary-color: #2a2a2a;
+  --secondary-color-transparent: #ffffff66;
+  --primary-bg-color: #1a1a1a;
+  --foreground-color: #f0f0f0;
   --red-color: #a52e4d;
-  --green-color: #228039;
-  --silver-color: #88808e;
-}
-@media(prefers-color-scheme: light) {
-  :root {
-    --primary-bg-color: #f0f0f0;
-    --secondary-bg-color: #eeeeee;
-    --tertiary-bg-color: #dddddd;
-    --foreground-color: #111111;
-    --accent-color: #000000;
-  }
-}
-@media(prefers-color-scheme: dark) {
-  :root {
-    --primary-bg-color: #111111;
-    --secondary-bg-color: #131313;
-    --tertiary-bg-color: #232323;
-    --foreground-color: #f0f0f0;
-    --accent-color: #aaaaaa;
-  }
 }
 
 main {
@@ -81,7 +61,11 @@ main {
   top: 0;
   position: absolute;
 
-  background: linear-gradient(180deg, var(--primary-bg-color) 0%, transparent 100%);
+  background: linear-gradient(
+    180deg,
+    var(--primary-bg-color) 0%,
+    transparent 100%
+  );
 }
 .histories-end {
   height: 3rem;
@@ -91,7 +75,11 @@ main {
   bottom: 0;
   position: absolute;
 
-  background: linear-gradient(0deg, var(--primary-bg-color) 0%, transparent 100%);
+  background: linear-gradient(
+    0deg,
+    var(--primary-bg-color) 0%,
+    transparent 100%
+  );
 }
 
 .history {
@@ -99,7 +87,7 @@ main {
   width: 100%;
   max-width: 40rem;
 
-  background-color: var(--tertiary-bg-color);
+  background-color: var(--secondary-color);
   border-radius: 10px;
   border-left: 2px solid var(--primary-color);
 
@@ -109,7 +97,7 @@ main {
   opacity: var(--opacity, 1);
 }
 .history:hover {
-  background-color: var(--secondary-bg-color);
+  background-color: var(--secondary-color);
 }
 
 .history-delete-button {
@@ -120,14 +108,14 @@ main {
   margin: 0;
   outline: none;
   border: none;
-  background-color: var(--secondary-bg-color);
+  background-color: var(--secondary-color);
   color: var(--foreground-color);
   border-radius: 0 0 0 10px;
   cursor: pointer;
   transition: 0.2s;
 }
 .history-delete-button:hover {
-  background-color: var(--tertiary-bg-color);
+  background-color: var(--secondary-color);
   padding: 0.75rem;
 }
 
@@ -135,6 +123,7 @@ main {
   overflow-y: auto;
   height: 100%;
   width: 100%;
+  max-width: 1200px;
 
   display: flex;
   flex-direction: column;
@@ -145,28 +134,25 @@ main {
 }
 
 .message {
-  width: 96%;
-  max-width: 80rem;
-
-  display: grid;
-
-  background-color: var(--secondary-bg-color);
+  max-width: 75%;
   padding: 0.5rem 1rem;
-  border-radius: 10px;
+  border-radius: 20px;
 }
 .message-role-assistant {
-  border-bottom: 2px solid var(--primary-color);
-  border-left: 2px solid var(--primary-color);
-  box-shadow: -10px 10px 20px 2px var(--primary-color-transparent);
+  background-color: var(--secondary-color);
+  margin-right: auto;
+  color: #fff;
 }
 .message-role-user {
-  border-bottom: 2px solid var(--secondary-color);
-  border-right: 2px solid var(--secondary-color);
-  box-shadow: 10px 10px 20px 2px var(--secondary-color-transparent);
+  margin-left: auto;
+  background-color: var(--primary-color);
+  color: #000;
 }
 .download-progress {
   margin-bottom: 12em;
   overflow-y: auto;
+  min-height: 350px;
+  padding: 2rem;
 }
 .message > pre {
   white-space: pre-wrap;
@@ -191,17 +177,70 @@ main {
 }
 
 .toast {
-    width: 100%; /* Take up the full width of the page */
-    background-color: #fc2a2a; /* Dark background color */
-    color: #fff; /* White text color */
-    text-align: center; /* Centered text */
-    border-radius: 2px; /* Rounded borders */
-    padding: 16px; /* Padding */
-    position: fixed; /* Sit on top of the screen content */
-    z-index: 9999; /* Add a z-index if needed */
-    top: 0; /* Position at the top of the page */
-    left: 0; /* Extend from the left edge */
-    right: 0; /* Extend to the right edge */
+    width: 100%;
+    background-color: #fc2a2a;
+    color: #fff;
+    text-align: left;
+    border-radius: 2px;
+    padding: 16px;
+    position: fixed;
+    z-index: 9999;
+    top: 0;
+    left: 0;
+    right: 0;
+    display: flex;
+    flex-direction: column;
+    white-space: pre-wrap;
+    font-family: monospace;
+}
+
+.toast-header {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    width: 100%;
+}
+
+.toast-error-message {
+    flex-grow: 1;
+}
+
+.toast-header-buttons {
+    display: flex;
+    align-items: center;
+    gap: 16px;
+    margin-left: 24px;
+}
+
+.toast-expand-button {
+    background: none;
+    border: none;
+    color: white;
+    padding: 4px;
+    cursor: pointer;
+    font-size: 1em;
+}
+
+.toast-close-button {
+    background: none;
+    border: none;
+    color: white;
+    padding: 4px;
+    cursor: pointer;
+    font-size: 1.2em;
+    line-height: 1;
+}
+
+.toast-expand-button:hover,
+.toast-close-button:hover {
+    opacity: 0.8;
+}
+
+.toast-content {
+    margin-top: 10px;
+    padding: 10px;
+    background-color: rgba(0, 0, 0, 0.2);
+    border-radius: 4px;
 }
 
 .hljs {
@@ -220,14 +259,14 @@ main {
   margin: 0;
   outline: none;
   border: none;
-  background-color: var(--secondary-bg-color);
+  background-color: var(--secondary-color);
   color: var(--foreground-color);
   border-radius: 0 0 0 10px;
   cursor: pointer;
   transition: 0.2s;
 }
 .clipboard-button:hover {
-  background-color: var(--tertiary-bg-color);
+  background-color: var(--secondary-color);
   padding: 0.75rem;
 }
 
@@ -236,9 +275,14 @@ main {
   bottom: 0;
 
   /* linear gradient from background-color to transparent on the top */
-  background: linear-gradient(0deg, var(--primary-bg-color) 55%, transparent 100%);
+  background: linear-gradient(
+    0deg,
+    var(--primary-bg-color) 55%,
+    transparent 100%
+  );
 
   width: 100%;
+  max-width: 1200px;
   display: flex;
   flex-direction: column;
   justify-content: center;
@@ -285,7 +329,7 @@ main {
   min-height: 3rem;
   max-height: 8rem;
 
-  background-color: var(--tertiary-bg-color);
+  background-color: var(--secondary-color);
   color: var(--foreground-color);
   border-radius: 10px;
   border: none;
@@ -297,8 +341,8 @@ main {
   height: 3rem;
   width: 4rem;
 
-  background-color: var(--secondary-color);
-  color: var(--foreground-color);
+  background-color: var(--primary-color);
+  color: var(--secondary-color);
   border-radius: 10px;
   padding: 0.5rem;
   cursor: pointer;
@@ -307,7 +351,7 @@ main {
   background-color: var(--secondary-color-transparent);
 }
 .input-button:disabled {
-  background-color: var(--secondary-bg-color);
+  background-color: var(--secondary-color);
   cursor: not-allowed;
 }
 
@@ -414,4 +458,27 @@ p {
   max-width: 100%;
   max-height: 100%;
   object-fit: contain;
+}
+
+.clear-history-button {
+  background-color: var(--red-color);
+  color: white;
+  padding: 10px 20px;
+  border-radius: 5px;
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  transition: all 0.3s ease;
+  margin: 1rem auto;
+  border: none;
+  cursor: pointer;
+}
+
+.clear-history-button:hover {
+  opacity: 0.8;
+  transform: scale(1.05);
+}
+
+.clear-history-button i {
+  font-size: 14px;
 }

+ 26 - 25
exo/tinychat/index.html

@@ -26,33 +26,27 @@
 <body>
 <main x-data="state" x-init="console.log(endpoint)">
      <!-- Error Toast -->
-    <div x-show="errorMessage" x-transition.opacity x-text="errorMessage" class="toast">
+    <div x-show="errorMessage" x-transition.opacity class="toast">
+        <div class="toast-header">
+            <span class="toast-error-message" x-text="errorMessage.basic"></span>
+            <div class="toast-header-buttons">
+                <button @click="errorExpanded = !errorExpanded; if (errorTimeout) { clearTimeout(errorTimeout); errorTimeout = null; }" 
+                        class="toast-expand-button" 
+                        x-show="errorMessage.stack">
+                    <span x-text="errorExpanded ? 'Hide Details' : 'Show Details'"></span>
+                </button>
+                <button @click="errorMessage = null; errorExpanded = false;" class="toast-close-button">
+                    <i class="fas fa-times"></i>
+                </button>
+            </div>
+        </div>
+        <div class="toast-content" x-show="errorExpanded" x-transition>
+            <span x-text="errorMessage.stack"></span>
+        </div>
     </div>
 <div class="model-selector">
-<select @change="if (cstate) cstate.selectedModel = $event.target.value" x-model="cstate.selectedModel">
-<option selected="" value="llama-3.2-1b">Llama 3.2 1B</option>
-<option value="llama-3.2-3b">Llama 3.2 3B</option>
-<option value="llama-3.1-8b">Llama 3.1 8B</option>
-<option value="llama-3.1-70b">Llama 3.1 70B</option>
-<option value="llama-3.1-70b-bf16">Llama 3.1 70B (BF16)</option>
-<option value="llama-3.1-405b">Llama 3.1 405B</option>
-<option value="llama-3-8b">Llama 3 8B</option>
-<option value="llama-3-70b">Llama 3 70B</option>
-<option value="nemotron-70b">Nemotron 70B</option>
-<option value="nemotron-70b-bf16">Nemotron 70B (BF16)</option>
-<option value="mistral-nemo">Mistral Nemo</option>
-<option value="mistral-large">Mistral Large</option>
-<option value="deepseek-coder-v2-lite">Deepseek Coder V2 Lite</option>
-<option value="deepseek-coder-v2.5">Deepseek Coder V2.5</option>
-<option value="llava-1.5-7b-hf">LLaVa 1.5 7B (Vision Model)</option>
-<option value="qwen-2.5-coder-1.5b">Qwen 2.5 Coder 1.5B</option>
-<option value="qwen-2.5-coder-7b">Qwen 2.5 Coder 7B</option>
-<option value="qwen-2.5-7b">Qwen 2.5 7B</option>
-<option value="qwen-2.5-math-7b">Qwen 2.5 7B (Math)</option>
-<option value="qwen-2.5-14b">Qwen 2.5 14B</option>
-<option value="qwen-2.5-72b">Qwen 2.5 72B</option>
-<option value="qwen-2.5-math-72b">Qwen 2.5 72B (Math)</option>
-</select>
+  <select @change="if (cstate) cstate.selectedModel = $event.target.value" x-model="cstate.selectedModel" x-init="await populateSelector()" class='model-select'>
+  </select>
 </div>
 <div @popstate.window="
       if (home === 2) {
@@ -68,6 +62,13 @@
       if (home === -1) setTimeout(() =&gt; home = 0, 100);
     " x-show="home === 0" x-transition="">
 <h1 class="title megrim-regular">tinychat</h1>
+<template x-if="histories.length">
+  <button 
+    @click="if(confirm('Are you sure you want to clear all history?')) clearAllHistory();" 
+    class="clear-history-button">
+    <i class="fas fa-trash"></i> Clear All History
+  </button>
+</template>
 <div class="histories-container-container">
 <template x-if="histories.length">
 <div class="histories-start"></div>

+ 109 - 14
exo/tinychat/index.js

@@ -4,8 +4,8 @@ document.addEventListener("alpine:init", () => {
     cstate: {
       time: null,
       messages: [],
-      selectedModel: 'llama-3.1-8b',
-    },
+      selectedModel: 'llama-3.2-1b',
+    },    
 
     // historical state
     histories: JSON.parse(localStorage.getItem("histories")) || [],
@@ -14,6 +14,8 @@ document.addEventListener("alpine:init", () => {
     generating: false,
     endpoint: `${window.location.origin}/v1`,
     errorMessage: null,
+    errorExpanded: false,
+    errorTimeout: null,
 
     // performance tracking
     time_till_first: 0,
@@ -47,6 +49,12 @@ document.addEventListener("alpine:init", () => {
         localStorage.setItem("histories", JSON.stringify(this.histories));
       }
     },
+
+    clearAllHistory() {
+      this.histories = [];
+      localStorage.setItem("histories", JSON.stringify([]));
+    },
+
     // Utility functions
     formatBytes(bytes) {
       if (bytes === 0) return '0 B';
@@ -66,6 +74,56 @@ document.addEventListener("alpine:init", () => {
       return `${s}s`;
     },
 
+    async populateSelector() {
+      try {
+        const response = await fetch(`${window.location.origin}/modelpool`);
+        const responseText = await response.text(); // Get raw response text first
+        
+        if (!response.ok) {
+          throw new Error(`HTTP error! status: ${response.status}`);
+        }
+        
+        // Try to parse the response text
+        let responseJson;
+        try {
+          responseJson = JSON.parse(responseText);
+        } catch (parseError) {
+          console.error('Failed to parse JSON:', parseError);
+          throw new Error(`Invalid JSON response: ${responseText}`);
+        }
+
+        const sel = document.querySelector(".model-select");
+        if (!sel) {
+          throw new Error("Could not find model selector element");
+        }
+
+        // Clear the current options and add new ones
+        sel.innerHTML = '';
+          
+        const modelDict = responseJson["model pool"];
+        if (!modelDict) {
+          throw new Error("Response missing 'model pool' property");
+        }
+
+        Object.entries(modelDict).forEach(([key, value]) => {
+          const opt = document.createElement("option");
+          opt.value = key;
+          opt.textContent = value;
+          sel.appendChild(opt);
+        });
+
+        // Set initial value to the first model
+        const firstKey = Object.keys(modelDict)[0];
+        if (firstKey) {
+          sel.value = firstKey;
+          this.cstate.selectedModel = firstKey;
+        }
+      } catch (error) {
+        console.error("Error populating model selector:", error);
+        this.errorMessage = `Failed to load models: ${error.message}`;
+      }
+    },
+
     async handleImageUpload(event) {
       const file = event.target.files[0];
       if (file) {
@@ -110,12 +168,30 @@ document.addEventListener("alpine:init", () => {
         localStorage.setItem("pendingMessage", value);
         this.processMessage(value);
       } catch (error) {
-        console.error('error', error)
-        this.lastErrorMessage = error.message || 'Unknown error on handleSend';
-        this.errorMessage = error.message || 'Unknown error on handleSend';
-        setTimeout(() => {
-          this.errorMessage = null;
-        }, 5 * 1000)
+        console.error('error', error);
+        const errorDetails = {
+            message: error.message || 'Unknown error',
+            stack: error.stack,
+            name: error.name || 'Error'
+        };
+        
+        this.errorMessage = {
+            basic: `${errorDetails.name}: ${errorDetails.message}`,
+            stack: errorDetails.stack
+        };
+
+        // Clear any existing timeout
+        if (this.errorTimeout) {
+            clearTimeout(this.errorTimeout);
+        }
+
+        // Only set the timeout if the error details aren't expanded
+        if (!this.errorExpanded) {
+            this.errorTimeout = setTimeout(() => {
+                this.errorMessage = null;
+                this.errorExpanded = false;
+            }, 30 * 1000);
+        }
         this.generating = false;
       }
     },
@@ -232,12 +308,30 @@ document.addEventListener("alpine:init", () => {
           console.error("Failed to save histories to localStorage:", error);
         }
       } catch (error) {
-        console.error('error', error)
-        this.lastErrorMessage = error;
-        this.errorMessage = error;
-        setTimeout(() => {
-          this.errorMessage = null;
-        }, 5 * 1000)
+        console.error('error', error);
+        const errorDetails = {
+            message: error.message || 'Unknown error',
+            stack: error.stack,
+            name: error.name || 'Error'
+        };
+        
+        this.errorMessage = {
+            basic: `${errorDetails.name}: ${errorDetails.message}`,
+            stack: errorDetails.stack
+        };
+
+        // Clear any existing timeout
+        if (this.errorTimeout) {
+            clearTimeout(this.errorTimeout);
+        }
+
+        // Only set the timeout if the error details aren't expanded
+        if (!this.errorExpanded) {
+            this.errorTimeout = setTimeout(() => {
+                this.errorMessage = null;
+                this.errorExpanded = false;
+            }, 30 * 1000);
+        }
       } finally {
         this.generating = false;
       }
@@ -529,6 +623,7 @@ function createParser(onParse) {
     }
   }
 }
+
 const BOM = [239, 187, 191];
 function hasBom(buffer) {
   return BOM.every((charCode, index) => buffer.charCodeAt(index) === charCode);
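
The new populateSelector() builds the model dropdown from a GET /modelpool request and expects the JSON body to keep the options under a "model pool" key, while the reworked error handling wraps failures into {basic, stack} objects shown by the toast. A hedged aiohttp sketch of an endpoint with the shape the frontend expects; the handler, route registration and placeholder model names are illustrative, not exo's actual API code:

from aiohttp import web

def model_pool() -> dict:
  # Placeholder id -> display-name pairs; a real server would derive these from its model registry.
  return {"llama-3.2-1b": "Llama 3.2 1B", "llama-3.2-3b": "Llama 3.2 3B"}

async def handle_model_pool(request: web.Request) -> web.Response:
  return web.json_response({"model pool": model_pool()})

app = web.Application()
app.add_routes([web.get("/modelpool", handle_model_pool)])
# web.run_app(app, port=52415)  # 52415 is the API port used elsewhere in this changeset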

+ 44 - 41
exo/tinychat/update_deps.py

@@ -4,49 +4,52 @@ from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
 import re
 
+
 def download_file(url, local_path):
-    response = requests.get(url)
-    if response.status_code == 200:
-        os.makedirs(os.path.dirname(local_path), exist_ok=True)
-        with open(local_path, 'wb') as f:
-            f.write(response.content)
-        print(f"Downloaded: {local_path}")
-    else:
-        print(response.status_code)
-        print(f"Failed to download: {url}")
+  response = requests.get(url)
+  if response.status_code == 200:
+    os.makedirs(os.path.dirname(local_path), exist_ok=True)
+    with open(local_path, 'wb') as f:
+      f.write(response.content)
+    print(f"Downloaded: {local_path}")
+  else:
+    print(response.status_code)
+    print(f"Failed to download: {url}")
+
 
 def update_html(html_content, base_url):
-    soup = BeautifulSoup(html_content, 'html.parser')
+  soup = BeautifulSoup(html_content, 'html.parser')
 
-    for tag in soup.find_all(['script', 'link']):
-        if tag.has_attr('src'):
-            url = tag['src']
-        elif tag.has_attr('href'):
-            url = tag['href']
-        else:
-            continue
+  for tag in soup.find_all(['script', 'link']):
+    if tag.has_attr('src'):
+      url = tag['src']
+    elif tag.has_attr('href'):
+      url = tag['href']
+    else:
+      continue
+
+    if url.startswith(('http://', 'https://')):
+      full_url = url
+    else:
+      full_url = urljoin(base_url, url)
 
-        if url.startswith(('http://', 'https://')):
-            full_url = url
-        else:
-            full_url = urljoin(base_url, url)
+    parsed_url = urlparse(full_url)
+    local_path = os.path.join('static', parsed_url.netloc, parsed_url.path.lstrip('/'))
 
-        parsed_url = urlparse(full_url)
-        local_path = os.path.join('static', parsed_url.netloc, parsed_url.path.lstrip('/'))
+    download_file(full_url, local_path)
 
-        download_file(full_url, local_path)
+    relative_path = os.path.relpath(local_path, '.')
+    if tag.name == 'script':
+      tag['src'] = "/" + relative_path
+    elif tag.name == 'link':
+      tag['href'] = "/" + relative_path
 
-        relative_path = os.path.relpath(local_path, '.')
-        if tag.name == 'script':
-            tag['src'] = "/" + relative_path
-        elif tag.name == 'link':
-            tag['href'] = "/" + relative_path
+  return str(soup)
 
-    return str(soup)
 
 # Read the HTML file
 with open('./index.html', 'r') as f:
-    html_content = f.read()
+  html_content = f.read()
 
 # Update HTML and download files
 # updated_html = update_html(html_content, 'https://example.com')
@@ -68,7 +71,7 @@ download_file(css_url, css_output_path)
 
 # Parse CSS file for font URLs
 with open(css_output_path, 'r', encoding='utf-8') as f:
-    css_content = f.read()
+  css_content = f.read()
 
 # Extract font URLs from the CSS content
 font_urls = re.findall(r'url\((.*?\.(?:woff2|ttf))\)', css_content)
@@ -77,14 +80,14 @@ print(f"Found {len(font_urls)} font URLs")
 
 # Download font files
 for font_url in font_urls:
-    font_url = font_url.strip('"\'')
-    if font_url.startswith('../'):
-        font_url = font_url[3:]
+  font_url = font_url.strip('"\'')
+  if font_url.startswith('../'):
+    font_url = font_url[3:]
 
-    # Use base_url instead of urljoin to keep the version number
-    full_url = base_url + font_url
-    relative_path = font_url
-    output_path = os.path.join(output_dir, relative_path)
-    download_file(full_url, output_path)
+  # Use base_url instead of urljoin to keep the version number
+  full_url = base_url + font_url
+  relative_path = font_url
+  output_path = os.path.join(output_dir, relative_path)
+  download_file(full_url, output_path)
 
-print("Download complete!")
+print("Download complete!")

+ 4 - 2
exo/topology/device_capabilities.py

@@ -52,9 +52,11 @@ CHIP_FLOPS = {
   "Apple M2 Max": DeviceFlops(fp32=13.49*TFLOPS, fp16=26.98*TFLOPS, int8=53.96*TFLOPS),
   "Apple M2 Ultra": DeviceFlops(fp32=26.98*TFLOPS, fp16=53.96*TFLOPS, int8=107.92*TFLOPS),
   "Apple M3": DeviceFlops(fp32=3.55*TFLOPS, fp16=7.10*TFLOPS, int8=14.20*TFLOPS),
-  "Apple M3 Max": DeviceFlops(fp32=14.20*TFLOPS, fp16=28.40*TFLOPS, int8=56.80*TFLOPS),
   "Apple M3 Pro": DeviceFlops(fp32=4.97*TFLOPS, fp16=9.94*TFLOPS, int8=19.88*TFLOPS),
-  "Apple M4": DeviceFlops(fp32=3.55*TFLOPS, fp16=7.10*TFLOPS, int8=14.20*TFLOPS),
+  "Apple M3 Max": DeviceFlops(fp32=14.20*TFLOPS, fp16=28.40*TFLOPS, int8=56.80*TFLOPS),
+  "Apple M4": DeviceFlops(fp32=4.26*TFLOPS, fp16=8.52*TFLOPS, int8=17.04*TFLOPS),
+  "Apple M4 Pro": DeviceFlops(fp32=5.72*TFLOPS, fp16=11.44*TFLOPS, int8=22.88*TFLOPS),
+  "Apple M4 Max": DeviceFlops(fp32=18.03*TFLOPS, fp16=36.07*TFLOPS, int8=72.14*TFLOPS),
   ### A chips
   "Apple A13 Bionic": DeviceFlops(fp32=0.69*TFLOPS, fp16=1.38*TFLOPS, int8=2.76*TFLOPS),
   "Apple A14 Bionic": DeviceFlops(fp32=0.75*TFLOPS, fp16=1.50*TFLOPS, int8=3.00*TFLOPS),

+ 1 - 1
extra/start_openwebui.sh

@@ -1,3 +1,3 @@
-API_ENDPOINT="http://${API_ENDPOINT:-$(ifconfig | grep 'inet ' | grep -v '127.0.0.1' | awk '{print $2}' | head -n 1):8000}"
+API_ENDPOINT="http://${API_ENDPOINT:-$(ifconfig | grep 'inet ' | grep -v '127.0.0.1' | awk '{print $2}' | head -n 1):52415}"
 echo "Using API_ENDPOINT=${API_ENDPOINT}"
 docker run -d -p 3000:8080 -e OPENAI_API_BASE_URL="${API_ENDPOINT}" -e OPENAI_API_KEY=your_secret_key -v open-webui:/app/backend/data --name open-webui --restart always ghcr.io/open-webui/open-webui:main

+ 1 - 1
format.py

@@ -21,7 +21,7 @@ def run_yapf(target):
 
 def main():
   if len(sys.argv) < 2:
-    print("Usage: python format.py <directory_or_file>")
+    print("Usage: python3 format.py <directory_or_file> e.g. python3 format.py ./exo")
     sys.exit(1)
 
   target = sys.argv[1]

+ 0 - 5
lint.sh

@@ -1,5 +0,0 @@
-#!/bin/bash
-
-pip3 install -e '.[linting]'
-python3 -m ruff check .
-python3 -m pylint .

+ 0 - 7
pyproject.toml

@@ -1,7 +0,0 @@
-[tool.pylint.format]
-indent-string = '  '
-max-line-length = 200
-
-[tool.autopep8]
-max_line_length = 200
-indent_size = 2

+ 0 - 43
ruff.toml

@@ -1,43 +0,0 @@
-indent-width = 2
-preview = true
-target-version = "py312"
-
-lint.select = [
-  "F",  # Pyflakes
-  "W6",
-  "E71",
-  "E72",
-  "E112",   # no-indented-block
-  "E113",   # unexpected-indentation
-  # "E124",
-  "E203",   # whitespace-before-punctuation
-  "E272",   # multiple-spaces-before-keyword
-  "E303",   # too-many-blank-lines
-  "E304",   # blank-line-after-decorator
-  "E501",   # line-too-long
-  # "E502",
-  "E702",   # multiple-statements-on-one-line-semicolon
-  "E703",   # useless-semicolon
-  "E731",   # lambda-assignment
-  "W191",   # tab-indentation
-  "W291",   # trailing-whitespace
-  "W293",   # blank-line-with-whitespace
-  "UP039",  # unnecessary-class-parentheses
-  "C416",   # unnecessary-comprehension
-  "RET506", # superfluous-else-raise
-  "RET507", # superfluous-else-continue
-  "A",      # builtin-variable-shadowing, builtin-argument-shadowing, builtin-attribute-shadowing
-  "SIM105", # suppressible-exception
-  "FURB110",# if-exp-instead-of-or-operator
-]
-
-line-length = 200
-
-exclude = [
-  "docs/",
-  "examples/",
-  "extra/",
-  "exo/networking/grpc/node_service_pb2.py",
-  "exo/networking/grpc/node_service_pb2_grpc.py",
-  "exo/helpers.py",
-]

+ 60 - 0
scripts/build_exo.py

@@ -0,0 +1,60 @@
+import site
+import subprocess
+import sys
+import os 
+import pkgutil
+
+def run():
+    site_packages = site.getsitepackages()[0]
+    command = [
+        f"{sys.executable}", "-m", "nuitka", "exo/main.py",
+        "--company-name=exolabs",
+        "--product-name=exo",
+        "--output-dir=dist",
+        "--follow-imports",
+        "--standalone",
+        "--output-filename=exo",
+        "--onefile",
+        "--python-flag=no_site"
+    ]
+
+    if sys.platform == "darwin": 
+        command.extend([
+            "--macos-app-name=exo",
+            "--macos-app-mode=gui",
+            "--macos-app-version=0.0.1",
+            "--macos-signed-app-name=com.exolabs.exo",
+            "--macos-sign-identity=auto",
+            "--macos-sign-notarization",
+            "--include-distribution-meta=mlx",
+            "--include-module=mlx._reprlib_fix",
+            "--include-module=mlx._os_warning",
+            f"--include-data-files={site_packages}/mlx/lib/mlx.metallib=mlx/lib/mlx.metallib",
+            f"--include-data-files={site_packages}/mlx/lib/mlx.metallib=./mlx.metallib",
+            "--include-distribution-meta=pygments",
+            "--nofollow-import-to=tinygrad"
+        ])
+        inference_modules = [
+            name for _, name, _ in pkgutil.iter_modules(['exo/inference/mlx/models'])
+        ]
+        for module in inference_modules:
+            command.append(f"--include-module=exo.inference.mlx.models.{module}")
+    elif sys.platform == "win32":  
+        command.extend([
+            "--windows-icon-from-ico=docs/exo-logo-win.ico",
+            "--file-version=0.0.1",
+            "--product-version=0.0.1"
+        ])
+    elif sys.platform.startswith("linux"):  
+        command.extend([
+            "--include-distribution-metadata=pygments",
+            "--linux-icon=docs/exo-rounded.png"
+        ])
+    try:
+        subprocess.run(command, check=True)
+        print("Build completed!")
+    except subprocess.CalledProcessError as e:
+        print(f"An error occurred: {e}")
+
+if __name__ == "__main__":
+    run()
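
On macOS the script walks exo/inference/mlx/models with pkgutil.iter_modules and turns every module it finds into a --include-module flag, presumably because the model implementations are selected by name at runtime and would otherwise be missed by --follow-imports; running python3 scripts/build_exo.py from the repo root then produces a onefile binary under dist/. A standalone sketch of just that discovery step (the helper name is illustrative):

import pkgutil

def mlx_model_include_flags(models_dir: str = "exo/inference/mlx/models") -> list:
  # pkgutil.iter_modules yields (finder, name, ispkg) for each module file in the directory.
  modules = [name for _, name, _ in pkgutil.iter_modules([models_dir])]
  return [f"--include-module=exo.inference.mlx.models.{m}" for m in modules]

if __name__ == "__main__":
  # Prints one --include-module flag per model file found; empty if run outside the repo root.
  print(mlx_model_include_flags())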

+ 7 - 0
scripts/compile_grpc.sh

@@ -0,0 +1,7 @@
+#!/bin/bash
+source ./install.sh
+pushd exo/networking/grpc
+python3 -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. node_service.proto
+sed -i "s/import\ node_service_pb2/from . &/" node_service_pb2_grpc.py
+popd
+
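
The sed expression rewrites the absolute import that grpc_tools.protoc emits into a relative one, so the generated stub resolves node_service_pb2 from within the exo.networking.grpc package; the & in the replacement stands for the whole matched text. The same substitution expressed in Python, applied to a line shaped like the generated one:

import re

line = "import node_service_pb2 as node__service__pb2"
print(re.sub(r"import node_service_pb2", r"from . \g<0>", line))
# -> from . import node_service_pb2 as node__service__pb2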

+ 10 - 13
setup.py

@@ -5,40 +5,37 @@ from setuptools import find_packages, setup
 
 # Base requirements for all platforms
 install_requires = [
-  "aiohttp==3.10.2",
+  "aiohttp==3.10.11",
   "aiohttp_cors==0.7.0",
   "aiofiles==24.1.0",
-  "grpcio==1.64.1",
-  "grpcio-tools==1.64.1",
+  "grpcio==1.68.0",
+  "grpcio-tools==1.68.0",
   "Jinja2==3.1.4",
   "netifaces==0.11.0",
   "numpy==2.0.0",
+  "nuitka==2.4.10",
   "nvidia-ml-py==12.560.30",
   "pillow==10.4.0",
   "prometheus-client==0.20.0",
-  "protobuf==5.27.1",
+  "protobuf==5.28.1",
   "psutil==6.0.0",
   "pydantic==2.9.2",
   "requests==2.32.3",
   "rich==13.7.1",
-  "safetensors==0.4.3",
   "tenacity==9.0.0",
   "tqdm==4.66.4",
-  "transformers==4.43.3",
+  "transformers==4.46.3",
   "uuid==1.30",
-  "tinygrad @ git+https://github.com/tinygrad/tinygrad.git@232edcfd4f8b388807c64fb1817a7668ce27cbad",
+  "tinygrad @ git+https://github.com/tinygrad/tinygrad.git@3b26e51fcebfc6576f4e0f99693e6f1406d61d79",
 ]
 
 extras_require = {
-  "linting": [
-    "pylint==3.2.6",
-    "ruff==0.5.5",
-    "mypy==1.11.0",
+  "formatting": [
     "yapf==0.40.2",
   ],
   "apple_silicon": [
-    "mlx==0.18.0",
-    "mlx-lm==0.18.2",
+    "mlx==0.20.0",
+    "mlx-lm==0.19.3",
   ],
 }
 

+ 1 - 1
test/reconnect.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 
 echo "Starting node 1"
-DEBUG_DISCOVERY=7 DEBUG=7 python3 main.py --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout 900 > output1.log 2>&1 &
+DEBUG_DISCOVERY=7 DEBUG=7 python3 main.py --node-id "node1" --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 52415 --chatgpt-api-response-timeout 900 > output1.log 2>&1 &
 PID1=$!
 echo "Started node 1 PID: $PID1"
 echo "Starting node 2"

+ 121 - 0
test/test_model_helpers.py

@@ -0,0 +1,121 @@
+import unittest
+from exo.models import get_supported_models, model_cards
+from exo.inference.inference_engine import inference_engine_classes
+from typing import NamedTuple
+
+class TestCase(NamedTuple):
+  name: str
+  engine_lists: list  # Will contain short names, will be mapped to class names
+  expected_models_contains: list
+  min_count: int | None
+  exact_count: int | None
+  max_count: int | None
+
+# Helper function to map short names to class names
+def expand_engine_lists(engine_lists):
+  def map_engine(engine):
+    return inference_engine_classes.get(engine, engine)  # Return original name if not found
+
+  return [[map_engine(engine) for engine in sublist]
+          for sublist in engine_lists]
+
+test_cases = [
+  TestCase(
+    name="single_mlx_engine",
+    engine_lists=[["mlx"]],
+    expected_models_contains=["llama-3.2-1b", "llama-3.1-70b", "mistral-nemo"],
+    min_count=10,
+    exact_count=None,
+    max_count=None
+  ),
+  TestCase(
+    name="single_tinygrad_engine",
+    engine_lists=[["tinygrad"]],
+    expected_models_contains=["llama-3.2-1b", "llama-3.2-3b"],
+    min_count=5,
+    exact_count=None,
+    max_count=10
+  ),
+  TestCase(
+    name="multiple_engines_or",
+    engine_lists=[["mlx", "tinygrad"], ["mlx"]],
+    expected_models_contains=["llama-3.2-1b", "llama-3.2-3b", "mistral-nemo"],
+    min_count=10,
+    exact_count=None,
+    max_count=None
+  ),
+  TestCase(
+    name="multiple_engines_all",
+    engine_lists=[["mlx", "tinygrad"], ["mlx", "tinygrad"]],
+    expected_models_contains=["llama-3.2-1b", "llama-3.2-3b", "mistral-nemo"],
+    min_count=10,
+    exact_count=None,
+    max_count=None
+  ),
+  TestCase(
+    name="distinct_engine_lists",
+    engine_lists=[["mlx"], ["tinygrad"]],
+    expected_models_contains=["llama-3.2-1b"],
+    min_count=5,
+    exact_count=None,
+    max_count=10
+  ),
+  TestCase(
+    name="no_engines",
+    engine_lists=[],
+    expected_models_contains=None,
+    min_count=None,
+    exact_count=len(model_cards),
+    max_count=None
+  ),
+  TestCase(
+    name="nonexistent_engine",
+    engine_lists=[["NonexistentEngine"]],
+    expected_models_contains=[],
+    min_count=None,
+    exact_count=0,
+    max_count=None
+  ),
+  TestCase(
+    name="dummy_engine",
+    engine_lists=[["dummy"]],
+    expected_models_contains=["dummy"],
+    min_count=None,
+    exact_count=1,
+    max_count=None
+  ),
+]
+
+class TestModelHelpers(unittest.TestCase):
+  def test_get_supported_models(self):
+    for case in test_cases:
+      with self.subTest(f"{case.name}_short_names"):
+        result = get_supported_models(case.engine_lists)
+        self._verify_results(case, result)
+
+      with self.subTest(f"{case.name}_class_names"):
+        class_name_lists = expand_engine_lists(case.engine_lists)
+        result = get_supported_models(class_name_lists)
+        self._verify_results(case, result)
+
+  def _verify_results(self, case, result):
+    if case.expected_models_contains:
+      for model in case.expected_models_contains:
+        self.assertIn(model, result)
+
+    if case.min_count:
+      self.assertGreater(len(result), case.min_count)
+
+    if case.exact_count is not None:
+      self.assertEqual(len(result), case.exact_count)
+
+    # Special case for distinct lists test
+    if case.name == "distinct_engine_lists":
+      self.assertLess(len(result), 10)
+      self.assertNotIn("mistral-nemo", result)
+
+    if case.max_count:
+      self.assertLess(len(result), case.max_count)
+
+if __name__ == '__main__':
+  unittest.main()
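
Read together, the cases pin down the semantics expected of get_supported_models: engines inside one list are alternatives, a model has to be matched by every list that is supplied, and an empty selection falls back to the whole of model_cards. A hedged sketch with those semantics; it illustrates the expected behaviour rather than the implementation in exo/models.py, and it assumes each card keeps its per-engine repos under a "repo" key as the test_tokenizers.py change below does:

from typing import Dict, List

def get_supported_models_sketch(model_cards: Dict[str, dict], engine_lists: List[List[str]]) -> List[str]:
  # No engine constraint: every model card qualifies (the "no_engines" case above).
  if not engine_lists:
    return list(model_cards.keys())

  def matches(card: dict, engines: List[str]) -> bool:
    # Engines within a single list are ORed together.
    return any(engine in card.get("repo", {}) for engine in engines)

  # A model must satisfy every engine list, i.e. the lists are ANDed.
  return [model_id for model_id, card in model_cards.items()
          if all(matches(card, engines) for engines in engine_lists)]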

+ 8 - 3
test/test_tokenizers.py

@@ -1,7 +1,7 @@
 import os
 import re
 from transformers import AutoTokenizer, AutoProcessor
-from exo.models import model_base_shards
+from exo.models import model_cards
 
 
 def test_tokenizer(name, tokenizer, verbose=False):
@@ -24,9 +24,14 @@ def test_tokenizer(name, tokenizer, verbose=False):
     strip_tokens = lambda s: s.lstrip(tokenizer.decode([tokenizer.bos_token_id])).rstrip(tokenizer.decode([tokenizer.eos_token_id]))
     assert text == strip_tokens(decoded) == strip_tokens(reconstructed)
 
-ignore = ["TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R", "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", "mlx-community/DeepSeek-V2.5-MLX-AQ4_1_64", "llava-hf/llava-1.5-7b-hf", "mlx-community/Qwen*", "dummy"]
+ignore = ["TriAiExperiments/SFR-Iterative-DPO-LLaMA-3-70B-R", "mlx-community/DeepSeek-Coder-V2-Lite-Instruct-4bit-mlx", "mlx-community/DeepSeek-V2.5-MLX-AQ4_1_64", "llava-hf/llava-1.5-7b-hf", "mlx-community/Qwen*", "dummy", "mlx-community/Meta-Llama-3.1-405B-Instruct-8bit"]
 ignore_pattern = re.compile(r"^(" + "|".join(model.replace("*", ".*") for model in ignore) + r")")
-models = [shard.model_id for shards in model_base_shards.values() for shard in shards.values() if not ignore_pattern.match(shard.model_id)]
+models = []
+for model_id in model_cards:
+  for engine_type, repo_id in model_cards[model_id].get("repo", {}).items():
+    if not ignore_pattern.match(repo_id):
+      models.append(repo_id)
+models = list(set(models))
 
 verbose = os.environ.get("VERBOSE", "0").lower() == "1"
 for m in models:
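
The rewritten collection loop implies that model_cards maps a model id to a card whose "repo" dict is keyed by inference-engine class name and valued with a Hugging Face repo id, and that the same repo can back several engines, hence the set() dedup. A hedged illustration of that assumed shape; the engine class name below is made up and the repo ids are simply ones already named in the ignore list:

model_cards = {
  "llama-3.1-405b": {"repo": {"ExampleInferenceEngine": "mlx-community/Meta-Llama-3.1-405B-Instruct-8bit"}},
  "llava-1.5-7b-hf": {"repo": {"ExampleInferenceEngine": "llava-hf/llava-1.5-7b-hf"}},
}

models = []
for model_id in model_cards:
  for engine_type, repo_id in model_cards[model_id].get("repo", {}).items():
    models.append(repo_id)  # the test additionally drops anything matching ignore_pattern
models = list(set(models))
print(models)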
