Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
81b00db
Add MLX backend for Apple Silicon
ChinChangYang May 23, 2026
628e377
MLX backend: ANE/CoreML correctness + concurrency fixes, cross-path p…
ChinChangYang May 26, 2026
d5647ed
Merge remote-tracking branch 'origin/master' into mlx-backend-squash
ChinChangYang Jun 2, 2026
19d7617
Fix MLX createComputeContext signature for merged nninterface API
ChinChangYang Jun 3, 2026
01d1e3a
Add transformer support to MLX backend
ChinChangYang Jun 3, 2026
be1513d
Port CoreML/ANE transformer support to MLX backend ANE mux
ChinChangYang Jun 3, 2026
8ffe8ad
Keep MLX GPU pooling in fp16 when useFP16
ChinChangYang Jun 3, 2026
3f45e0c
Make MLX Winograd tuner robust to failing candidates and drop redunda…
ChinChangYang Jun 3, 2026
735030e
Remove dead post-project CMAKE_OSX_DEPLOYMENT_TARGET pins
ChinChangYang Jun 3, 2026
829d4fb
Validate transformer attention projection dims in CoreML converter
ChinChangYang Jun 3, 2026
b6e32af
Reduce CoreML conversion peak and ANE steady-state memory
ChinChangYang May 30, 2026
fa4feb6
Enforce non-owning weight-view contract at compile time
ChinChangYang May 30, 2026
e7f1b98
RAII the gzFile handle in KataGoParser
ChinChangYang May 31, 2026
23d9822
Replace WeightEntry raw ptr+count with a local FloatView
ChinChangYang May 31, 2026
93289e0
Clarify weight-release safety comment: aneOnly is the guarantee
ChinChangYang May 31, 2026
4630a0d
Refactor weight release into per-struct releaseWeights() methods
ChinChangYang May 31, 2026
4cca6cc
Co-locate releaseWeights() defs with each struct's other methods
ChinChangYang May 31, 2026
35ba9d8
Conform CoreML transformer derived consts to the owned-weight + FP32 …
ChinChangYang Jun 4, 2026
4839b37
Cover transformer descriptors in releaseWeights()
ChinChangYang Jun 4, 2026
f5565a1
Release in-memory weights on the MLX backend's ANE-only path
ChinChangYang Jun 4, 2026
c741171
Validate transformer FFN matmul dimensions in CoreML parser
ChinChangYang Jun 4, 2026
95db996
Use ScopedFp32 RAII for all CoreML FP32-escalation windows
ChinChangYang Jun 4, 2026
ce18e63
Remove dead Model::apply() on the MLX backend
ChinChangYang Jun 4, 2026
7c71f5d
Normalize USE_BACKEND case before project(); clarify backend-agnostic…
ChinChangYang Jun 5, 2026
55b46ff
Accumulate MLX Winograd F(2,3) transforms in fp32
ChinChangYang Jun 5, 2026
3ae836d
Make MLX Winograd auto-tune coarse; reserve the wide sweep for full t…
ChinChangYang Jun 6, 2026
f2d89db
Fail loudly on unknown MLX block kinds; fix MLX build docs
ChinChangYang Jun 6, 2026
3fe7526
Wire the MLX winotuner into the ./katago tuner subcommand
ChinChangYang Jun 6, 2026
da9eb7b
Remove KATAGO_MLX_WINOTUNER_FULL; full sweep is command-only
ChinChangYang Jun 6, 2026
4deeaf4
Remove KATAGO_MLX_WINOTUNER_FORCE; re-tune is command-only
ChinChangYang Jun 6, 2026
ecab6c3
Add protobuf/abseil to the MLX build prerequisites in Compiling.md
ChinChangYang Jun 7, 2026
51b028d
Key the MLX Winograd tuner cache by GPU chip, not a fixed string
ChinChangYang Jun 7, 2026
5224098
Keep MLX GPU attention in fp16 (avoid accidental fp32 promotion)
ChinChangYang Jun 7, 2026
a6963d3
Resolve MLX backend review NITs: atomic tuner save, oracle test, dead…
ChinChangYang Jun 8, 2026
8afe269
Resolve MLX review findings: tuner +inf guard, registerWeight rvalue …
ChinChangYang Jun 9, 2026
1bfc4af
CI: add macOS MLX backend build job to build.yml
ChinChangYang Jun 11, 2026
7455e01
MLX: serialize GPU eval across NN server threads
ChinChangYang Jun 11, 2026
c74ce51
MLX: faster Winograd autotuner (fast coarse path, greedy descent, cro…
ChinChangYang Jun 11, 2026
f3ddff0
MLX: encapsulate tune memo and drain the ComputeHandle ctor
ChinChangYang Jun 11, 2026
1b0cbf9
MLX: unify the flat-sweep diagnostic logging into one helper
ChinChangYang Jun 12, 2026
db34ce4
Fix Windows CI: stop pinning CMake to the VS 2022 generator (#1208)
ChinChangYang Jun 12, 2026
b10ff59
MLX: overlap command encoding with GPU execution
ChinChangYang Jun 12, 2026
63446b6
MLX: fuse BN+act and residual epilogues into Winograd untransform
ChinChangYang Jun 13, 2026
d1052c7
MLX: fuse the gpool block's final residual into the Winograd untransform
ChinChangYang Jun 13, 2026
cbb6f20
MLX: fuse gpool regularConv bias-add + BN+act into the Winograd untra…
ChinChangYang Jun 13, 2026
90e8b9f
MLX: harden backend memcpy size + tuner cache parsing
ChinChangYang Jun 13, 2026
5534bfb
CoreML converter: restore parser validations + fix IO dtype mismatch
ChinChangYang Jun 13, 2026
2dc563c
MLX: tuner/backend review hardening (memo key, isValid bounds, fail-l…
ChinChangYang Jun 14, 2026
b40230f
CoreML parser: validate RMSNorm epsilon (in (0,1]) matching master
ChinChangYang Jun 14, 2026
5f64ed3
CoreML parser: validate nested numBlocks and matbias numChannels
ChinChangYang Jun 14, 2026
5b5899c
Refactor MainCmds::tuner into a thin per-backend dispatcher
ChinChangYang Jun 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 61 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,66 @@ jobs:
name: katago-macos-metal
path: cpp/katago

build-macos-mlx:
# macos-latest is Apple Silicon (arm64), which the MLX backend requires.
runs-on: macos-latest
permissions:
contents: read

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Install dependencies
run: |
brew install ninja zlib libzip protobuf abseil mlx

# The CMake build (build.ninja, CMakeCache.txt) bakes in version-pinned
# Homebrew Cellar paths for protobuf/abseil/mlx (e.g.
# -L/opt/homebrew/Cellar/mlx/0.31.2/lib). When Homebrew bumps any of those
# formulae, the cached paths go stale and the link fails.
# Capture the installed versions in the cache key so that
# a formula bump invalidates the cache and forces a fresh configure.
- name: Capture dependency versions for cache key
id: dep-versions
run: |
echo "versions=$(brew list --versions protobuf abseil mlx | tr '\n' '-')" >> "$GITHUB_OUTPUT"

- name: Cache CMake build
uses: actions/cache@v4
with:
path: |
cpp/CMakeCache.txt
cpp/CMakeFiles
cpp/build.ninja
cpp/.ninja_deps
cpp/.ninja_log
key: ${{ runner.os }}-cmake-mlx-${{ steps.dep-versions.outputs.versions }}-${{ hashFiles('**/CMakeLists.txt') }}
restore-keys: |
${{ runner.os }}-cmake-mlx-${{ steps.dep-versions.outputs.versions }}-

- name: Configure CMake
working-directory: cpp
run: |
cmake . -G Ninja -DUSE_BACKEND=MLX -DCMAKE_BUILD_TYPE=Release

- name: Build
working-directory: cpp
run: |
ninja

- name: Run tests
working-directory: cpp
run: |
./katago runtests

- name: Upload artifact
if: github.event_name == 'push' && github.ref == 'refs/heads/master'
uses: actions/upload-artifact@v4
with:
name: katago-macos-mlx
path: cpp/katago

build-windows:
runs-on: windows-latest
permissions:
Expand Down Expand Up @@ -185,7 +245,7 @@ jobs:
- name: Configure CMake
working-directory: cpp
run: |
cmake . -G "Visual Studio 17 2022" -A x64 `
cmake . -A x64 `
-DUSE_BACKEND=OPENCL `
-DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake"

Expand Down
9 changes: 5 additions & 4 deletions Compiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -131,23 +131,24 @@ As also mentioned in the instructions below but repeated here for visibility, if
* [Homebrew](https://brew.sh): `/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"`
* CMake with a minimum version of 3.18.2: `brew install cmake`.
* AppleClang and Swift compilers: `xcode-select --install`.
* If using the Metal backend, [Ninja](https://ninja-build.org): `brew install ninja`
* If using the Metal backend, protobuf and abseil: `brew install protobuf abseil`
* If using the Metal or MLX backend, [Ninja](https://ninja-build.org): `brew install ninja`
* If using the Metal or MLX backend, protobuf and abseil: `brew install protobuf abseil`
* If using the MLX backend (Apple Silicon only): `brew install mlx` (≥0.18). Requires CMake ≥3.27. KataGo finds MLX via CMake's default search (Homebrew installs it at `/opt/homebrew/share/cmake/MLX/`); override with `-DMLX_ROOT=/path/to/mlx/cmake` if needed.
* libzip: `brew install libzip`.
* If you want to do self-play training and research, probably Google perftools `brew install gperftools` for TCMalloc or some other better malloc implementation. For unknown reasons, the allocation pattern in self-play with large numbers of threads and parallel games causes a lot of memory fragmentation under glibc malloc that will eventually run your machine out of memory, but better mallocs handle it fine.
* If compiling to contribute to public distributed training runs, OpenSSL is required (`brew install openssl`).
* Clone this repo:
* `git clone https://github.com/lightvector/KataGo.git`
* Compile using CMake and either Ninja (Metal and MLX backends) or make (other backends) in the cpp directory:
* `cd KataGo/cpp`
* `cmake . -G Ninja -DUSE_BACKEND=METAL` or `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=EIGEN` depending on which backend you want.
* `cmake . -G Ninja -DUSE_BACKEND=METAL` or `cmake . -G Ninja -DUSE_BACKEND=MLX` or `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=EIGEN` depending on which backend you want. The METAL and MLX backends use Swift/C++ interop, which requires the Ninja generator (`-G Ninja`); the other backends use the default Make generator.
* Specify also `-DUSE_TCMALLOC=1` if using TCMalloc.
* Compiling will also call git commands to embed the git hash into the compiled executable, specify also `-DNO_GIT_REVISION=1` to disable it if this is causing issues for you.
* Specify `-DUSE_AVX2=1` to also compile Eigen with AVX2 and FMA support, which will make it incompatible with old CPUs but much faster. Intel-based Macs with new processors support AVX2, but Apple Silicon Macs do not support AVX2 natively. (If you want to go further, you can also add `-DCMAKE_CXX_FLAGS='-march=native'` which will specialize to precisely your machine's CPU, but the exe might not run on other machines at all).
* Specify `-DBUILD_DISTRIBUTED=1` to compile with support for contributing data to public distributed training runs.
* If building distributed, you will also need to build with Git revision support, including building within a clone of the repo, as opposed to merely an unzipped copy of its source.
* Only builds from specific tagged versions or branches can contribute, in particular, instead of the `master` branch, use either the latest [release](https://github.com/lightvector/KataGo/releases) tag or the tip of the `stable` branch. To minimize the chance of any data incompatibilities or bugs, please do NOT attempt to contribute with custom changes or circumvent these limitations.
* `ninja` for Metal backend, or `make` for other backends.
* `ninja` for the Metal and MLX backends, or `make` for other backends.
* Done! You should now have a compiled `katago` executable in your working directory.
* Pre-trained neural nets are available at [the main training website](https://katagotraining.org/).
* You will probably want to edit `configs/gtp_example.cfg` (see "Tuning for Performance" above).
Expand Down
104 changes: 100 additions & 4 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,34 @@
cmake_minimum_required(VERSION 3.18.2)
if(USE_BACKEND STREQUAL "METAL")

# Pre-project MLX setup. KataGo's MLX path enforces CMake 3.27 via the guard
# below (MLX itself requires only 3.25; 3.27 is chosen to match
# cmake_policy(VERSION 3.27)). The global cmake_minimum_required stays at
# 3.18.2 so non-MLX backends keep building on older CMake.
#
# The OSX deployment target is deliberately NOT pinned here. KataGo links
# Homebrew's prebuilt libmlx.dylib, whose minos reflects the macOS it was
# bottled on - that dylib, not this build, sets the real minimum macOS.
# Pinning a lower value only stamps a misleading minos on the executable and
# triggers a "linking with dylib built for newer version" linker warning;
# letting CMake default the target to the build host keeps minos honest.
# (A post-project set(CMAKE_OSX_DEPLOYMENT_TARGET) is a silent no-op for this
# Swift project - the target is fixed during project()/enable_language - so it
# is not pinned in the backend branches below either.)
# Normalize the backend name to uppercase BEFORE project(), so the
# case-insensitive behavior of the post-project() string(TOUPPER ...) below
# also applies to the pre-project() MLX version guard and the Swift language
# selection. Without this, a lowercase -DUSE_BACKEND=mlx would silently skip
# the 3.27 guard and the Swift enablement, then still build Swift sources later.
string(TOUPPER "${USE_BACKEND}" USE_BACKEND_NORMALIZED)

if(USE_BACKEND_NORMALIZED STREQUAL "MLX")
if(CMAKE_VERSION VERSION_LESS 3.27)
message(FATAL_ERROR "KataGo's USE_BACKEND=MLX path requires CMake 3.27 or newer. You have ${CMAKE_VERSION}. Install via: brew install cmake")
endif()
cmake_policy(VERSION 3.27)
endif()

if(USE_BACKEND_NORMALIZED STREQUAL "METAL" OR USE_BACKEND_NORMALIZED STREQUAL "MLX")
project(katago LANGUAGES CXX Swift)
else()
project(katago)
Expand Down Expand Up @@ -44,7 +73,7 @@ endif()
set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training")
set(USE_BACKEND CACHE STRING "Neural net backend")
string(TOUPPER "${USE_BACKEND}" USE_BACKEND)
set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN METAL)
set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN MLX METAL)

set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc")
set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe")
Expand Down Expand Up @@ -134,7 +163,6 @@ elseif(USE_BACKEND STREQUAL "METAL")

include(InitializeSwift)
include(AddSwift)
set(CMAKE_OSX_DEPLOYMENT_TARGET 13.0)
set(NEURALNET_BACKEND_SOURCES
neuralnet/metalbackend.cpp
)
Expand Down Expand Up @@ -164,8 +192,72 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
set(NEURALNET_BACKEND_SOURCES
neuralnet/eigenbackend.cpp
)
elseif(USE_BACKEND STREQUAL "MLX")
# MLX backend branch: checks platform and toolchain prerequisites, adds the
# katagocoreml subdirectory, locates the MLX CMake package, selects the
# backend sources, and defines the KataGoSwift static library.
message(STATUS "-DUSE_BACKEND=MLX, using MLX backend (with CoreML/ANE MUX) for Apple Silicon.")

# Platform gate: macOS only.
if(NOT APPLE)
message(FATAL_ERROR "USE_BACKEND=MLX is only supported on macOS. Detected: ${CMAKE_SYSTEM_NAME}")
endif()
# Architecture gate: an explicitly set CMAKE_OSX_ARCHITECTURES must be exactly
# "arm64" (so a multi-arch list is rejected); when it is unset,
# CMAKE_SYSTEM_PROCESSOR must be "arm64" instead.
if(CMAKE_OSX_ARCHITECTURES)
if(NOT CMAKE_OSX_ARCHITECTURES STREQUAL "arm64")
message(FATAL_ERROR "USE_BACKEND=MLX requires arm64. Got: ${CMAKE_OSX_ARCHITECTURES}")
endif()
elseif(NOT CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
message(FATAL_ERROR "USE_BACKEND=MLX requires Apple Silicon (arm64). Detected: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

# CoreML/ANE MUX prerequisites — same constraints the METAL branch above
# enforces (same wording for grep parity).
# Note: the generator check is an exact string match on "Ninja", so any other
# generator name fails here.
if(NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja")
message(FATAL_ERROR "Bidirectional C++ Interop requires Ninja generator. Have ${CMAKE_GENERATOR}")
endif()
if("${CMAKE_Swift_COMPILER_VERSION}" VERSION_LESS 5.9)
message(FATAL_ERROR "Bidirectional C++ Interop requires Swift 5.9 or greater. Have ${CMAKE_Swift_COMPILER_VERSION}")
endif()
if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
message(FATAL_ERROR "Project requires building with AppleClang. Have ${CMAKE_CXX_COMPILER_ID}")
endif()

# katagocoreml provides the native CoreML conversion C++ library used by the ANE mux.
add_subdirectory(external/katagocoreml)
# Make the modules under external/macos/cmake/modules visible to include().
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/external/macos/cmake/modules")

# When no SDK root is set, use the path reported by `xcrun --show-sdk-path`.
# NOTE(review): the xcrun exit status is not checked, so a failing xcrun leaves
# CMAKE_OSX_SYSROOT empty without any diagnostic.
if (NOT CMAKE_OSX_SYSROOT)
execute_process(COMMAND xcrun --show-sdk-path OUTPUT_VARIABLE CMAKE_OSX_SYSROOT OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()

# Presumably resolved through the CMAKE_MODULE_PATH entry appended above;
# _swift_generate_cxx_header used below is assumed to come from one of these.
include(InitializeSwift)
include(AddSwift)

# Minimum MLX version handed to find_package() below, and an optional
# user-supplied location for MLX's CMake package.
set(MLX_MIN_VERSION "0.18")
set(MLX_ROOT "" CACHE PATH "Optional path to MLX's CMake package; leave empty to use CMake's default search (e.g. Homebrew's /opt/homebrew/share/cmake/MLX/)")

# Homebrew installs MLX's CMake config to /opt/homebrew/share/cmake/MLX/, which is
# on CMake's default search path. MLX_ROOT, when set, is added as an extra hint.
find_package(MLX ${MLX_MIN_VERSION} CONFIG REQUIRED HINTS "${MLX_ROOT}")
# NOTE(review): MLX_VERSION and MLX_LIBRARY are expected to be set by the MLX
# package config; confirm against the installed MLXConfig.cmake.
message(STATUS "Found MLX ${MLX_VERSION} at ${MLX_LIBRARY}")

# C++ sources compiled for this backend: the backend itself, the Winograd
# tuner, and the backend's tests.
set(NEURALNET_BACKEND_SOURCES
neuralnet/mlxbackend.cpp
neuralnet/mlxwinotuner.cpp
neuralnet/mlxtests.cpp
)

# Build the KataGoSwift static library. Same lines as the METAL branch above,
# kept inline to leave the Metal branch untouched. The library exposes
# CoreMLComputeHandle to C++ via the generated KataGoSwift-swift.h.
add_library(KataGoSwift STATIC
neuralnet/metalbackend.swift
neuralnet/metallayers.swift)
# The generated header is written under the binary dir, and that include
# directory is exported PUBLIC so targets linking KataGoSwift can find it.
_swift_generate_cxx_header(
KataGoSwift
"${CMAKE_CURRENT_BINARY_DIR}/include/KataGoSwift/KataGoSwift-swift.h")
target_include_directories(KataGoSwift PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/include")
set_target_properties(KataGoSwift PROPERTIES Swift_MODULE_NAME "KataGoSwift")
# The interop flag is applied only when compiling Swift sources.
target_compile_options(KataGoSwift PUBLIC
"$<$<COMPILE_LANGUAGE:Swift>:-cxx-interoperability-mode=default>")
elseif(USE_BACKEND STREQUAL "")
message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=TENSORRT or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}")
message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=TENSORRT or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN or -DUSE_BACKEND=MLX or -DUSE_BACKEND=METAL to compile with the respective backend.${ColorReset}")
set(NEURALNET_BACKEND_SOURCES neuralnet/dummybackend.cpp)
else()
message(FATAL_ERROR "Unrecognized backend: " ${USE_BACKEND})
Expand Down Expand Up @@ -531,6 +623,10 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
message(STATUS "Found Eigen3 at ${EIGEN3_INCLUDE_DIRS}")
endif()
endif()
elseif(USE_BACKEND STREQUAL "MLX")
target_compile_definitions(katago PRIVATE USE_MLX_BACKEND)
target_link_libraries(katago mlx KataGoSwift katagocoreml
${KATAGOCOREML_DEP_LDFLAGS})
endif()

if(USE_BIGGER_BOARDS_EXPENSIVE)
Expand Down
3 changes: 3 additions & 0 deletions cpp/command/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,9 @@ int MainCmds::benchmark(const vector<string>& args) {
#endif
#ifdef USE_EIGEN_BACKEND
cout << "You are currently using the Eigen (CPU) version of KataGo. Due to having no GPU, it may be slow." << endl;
#endif
#ifdef USE_MLX_BACKEND
cout << "Your GTP config is currently set to mlxUseFP16 = " << nnEval->getUsingFP16Mode().toString() << endl;
#endif
cout << endl;
cout << "Your GTP config is currently set to use numSearchThreads = " << params.numThreads << endl;
Expand Down
Loading
Loading