Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
b05f559
Reduce CoreML conversion peak and ANE steady-state memory
ChinChangYang May 30, 2026
971fa9d
Enforce non-owning weight-view contract at compile time
ChinChangYang May 30, 2026
eeefc97
RAII the gzFile handle in KataGoParser
ChinChangYang May 31, 2026
6bfa617
Replace WeightEntry raw ptr+count with a local FloatView
ChinChangYang May 31, 2026
4159930
Clarify weight-release safety comment: aneOnly is the guarantee
ChinChangYang May 31, 2026
44342a3
Refactor weight release into per-struct releaseWeights() methods
ChinChangYang May 31, 2026
98b17eb
Co-locate releaseWeights() defs with each struct's other methods
ChinChangYang May 31, 2026
3939012
Merge lightvector/master into feature/coreml-conversion-memory-levers
ChinChangYang Jun 1, 2026
b4459da
Add Metal GPU + CoreML/ANE transformer support for b10c384h6nbttflrs …
ChinChangYang Jun 1, 2026
c5fd1e2
Merge branch 'master' into feature/metal-transformer-silu-b10c384h6
ChinChangYang Jun 1, 2026
6f8314b
Fix Metal GPU crash on GQA transformer attention (NDArray INT_MAX)
ChinChangYang Jun 1, 2026
13c4b30
Fix dropped SiLU activation in CoreML value/policy/meta heads
ChinChangYang Jun 1, 2026
792c476
Implement GQA support in CoreML/ANE MIL attention builder
ChinChangYang Jun 1, 2026
3839e52
Fix CoreML/ANE FP16 transformer accuracy via precision tiers
ChinChangYang Jun 1, 2026
3eb81ce
Refactor: dedupe CoreML global-pooling FP32 wrap
ChinChangYang Jun 1, 2026
d052d2a
Fix CoreML/ANE convnet regression: scope FP32 tiers to transformers
ChinChangYang Jun 2, 2026
f770888
Merge remote-tracking branch 'origin/master' into feature/metal-trans…
ChinChangYang Jun 2, 2026
1459024
Fix RoPE cos/sin table buffer leak in Metal backend
ChinChangYang Jun 2, 2026
c590c3a
Guard non-SwiGLU transformer FFN in Metal backend
ChinChangYang Jun 2, 2026
39f82f6
Use named constant for trunk norm kind in Metal backend
ChinChangYang Jun 2, 2026
895929f
Merge feature/metal-transformer-silu-b10c384h6 (PR #1205) into featur…
ChinChangYang Jun 4, 2026
8481a94
Conform CoreML transformer derived consts to the owned-weight + FP32 …
ChinChangYang Jun 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions cpp/external/katagocoreml/src/Converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@ void KataGoConverter::convert(const std::string& input_path,
throw std::invalid_argument("max_batch_size must be >= min_batch_size or <= 0 for unlimited");
}

// Parse KataGo model
KataGoParser parser(input_path);
KataGoModelDesc model = parser.parse();
// Parse KataGo model (parser + its decompressed buffer freed at end of scope)
KataGoModelDesc model;
{
KataGoParser parser(input_path);
model = parser.parse();
}

// Determine if using FP16 precision
bool use_fp16 = (options.compute_precision == "FLOAT16");
Expand All @@ -52,9 +55,8 @@ void KataGoConverter::convert(const std::string& input_path,
options.use_fp16_io);
auto program = builder.build();

// Get weights from builder
auto weights = builder.getWeights();
std::vector<WeightEntry> weights_copy(weights.begin(), weights.end());
// Serialize directly from the builder's weight views (no copy).
std::vector<WeightEntry>& weights = builder.getWeightsMutable();

// Update options with model metadata for serialization
ConversionOptions final_options = options;
Expand Down Expand Up @@ -82,7 +84,7 @@ void KataGoConverter::convert(const std::string& input_path,

// Serialize to .mlpackage
CoreMLSerializer serializer(final_options.specification_version);
serializer.serialize(program.get(), weights_copy, output_path, final_options);
serializer.serialize(program.get(), weights, output_path, final_options);
}

ModelInfo KataGoConverter::getModelInfo(const std::string& input_path) {
Expand Down
940 changes: 920 additions & 20 deletions cpp/external/katagocoreml/src/builder/MILBuilder.cpp

Large diffs are not rendered by default.

87 changes: 85 additions & 2 deletions cpp/external/katagocoreml/src/builder/MILBuilder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ class MILBuilder {
/// @return Unique pointer to MIL Program protobuf
std::unique_ptr<CoreML::Specification::MILSpec::Program> build();

/// Get weight entries for blob serialization
const std::vector<WeightEntry>& getWeights() const { return m_ops.getWeights(); }
/// Get weight entries for blob serialization (mutable; serialization sets blob_offset).
/// Returns a reference obtained from m_ops.getWeightsMutable() rather than a copy, so
/// the caller works on the builder's own entries.
/// NOTE(review): the reference presumably stays valid only while this MILBuilder
/// (and its m_ops) is alive -- confirm m_ops is a by-value member.
std::vector<WeightEntry>& getWeightsMutable() { return m_ops.getWeightsMutable(); }

/// Get board dimensions
int getBoardXSize() const { return m_board_x_size; }
Expand All @@ -43,6 +43,16 @@ class MILBuilder {
bool m_optimize_identity_mask;
bool m_use_fp16;
bool m_use_fp16_io;
// Every FP32 escalation inside an FP16-mode model executes off the ANE (the ANE is FP16-only), so
// escalations apply ONLY to transformer trunks (attention widens the activation range, which
// overflows FP16 conv/matmul/pooling accumulation).
// Plain convnets run pure FP16 on the ANE -- the long-standing pre-tier path, verified to pass
// testgpuerror (b18c384nbt) and ~2.3x faster than forcing their per-block global pooling to FP32.
// For transformers: narrow trunks (<256) build fully FP32; wider ones use non-spatial FP32 (matmuls +
// pooling) plus, for very wide trunks (>=320), conv FP32. RMSNorm reductions: FP32 when m_use_fp16.
static constexpr int CONV_FP32_MIN_TRUNK_CHANNELS = 320; // transformer convs run FP32 at/above this width
static constexpr int FULL_FP32_MAX_TRUNK_CHANNELS = 256; // transformer trunks below this build fully FP32
bool m_nonspatial_fp32 = false; // = m_use_fp16 && hasTransformer (matmuls + global pooling)
bool m_conv_fp32 = false; // = m_use_fp16 && hasTransformer && trunk_channels >= CONV_FP32_MIN_...
int m_min_batch_size;
int m_max_batch_size;
CoreML::Specification::MILSpec::DataType m_weight_dtype;
Expand Down Expand Up @@ -80,6 +90,24 @@ class MILBuilder {
const std::vector<float>& data,
const std::vector<int64_t>& shape);

// addConstOp registers a NON-OWNING view into `data` (see WeightEntry), so the
// backing storage must outlive serialization. Binding a temporary here would
// dangle. Deleted so such calls fail to compile; use addOwnedConstOp for
// derived/temporary tensors that KataGoOps should own instead.
void addConstOp(CoreML::Specification::MILSpec::Block* block,
const std::string& name,
std::vector<float>&& data,
const std::vector<int64_t>& shape) = delete;

void addOwnedConstOp(CoreML::Specification::MILSpec::Block* block,
const std::string& name,
std::vector<float>&& data,
const std::vector<int64_t>& shape);

void emitConstOp(CoreML::Specification::MILSpec::Block* block,
const std::string& name,
const std::vector<int64_t>& shape);

void addIntArrayConstOp(CoreML::Specification::MILSpec::Block* block,
const std::string& name,
const std::vector<int32_t>& values);
Expand All @@ -102,6 +130,23 @@ class MILBuilder {
const std::string& dtype,
const std::vector<int64_t>& shape);

// Cast to a tensor with FULLY-specified dims (no forced batch dim like addCastOp). Use for
// weight tensors (fixed [in,out] dims) when running an otherwise-FP16 op in FP32. Returns the
// new tensor name. dims use -1 for an unknown/batch dim, >=0 for a constant dim.
std::string castFixed(CoreML::Specification::MILSpec::Block* block,
const std::string& input,
const std::string& dtype,
const std::vector<int64_t>& dims);

// Emit global pooling, running it in FP32 when m_nonspatial_fp32 (cast input/mask up, pool,
// cast the pooled features back to FP16). valueVariant selects the value-head pooling variant.
void addGlobalPoolingFp32(CoreML::Specification::MILSpec::Block* block,
const std::string& input,
const std::string& mask,
int channels,
const std::string& output,
bool valueVariant);

void addConvOp(CoreML::Specification::MILSpec::Block* block,
const std::string& input,
const ConvLayerDesc& layer,
Expand All @@ -120,6 +165,44 @@ class MILBuilder {
int rank,
int channels);

void addSiluOps(CoreML::Specification::MILSpec::Block* block,
const std::string& input,
const std::string& output,
int rank,
int channels);

// Generic output-shape setter: dims with -1 entries become unknown/dynamic dimensions.
void setShape(CoreML::Specification::MILSpec::Operation* op,
const std::string& name,
const std::vector<int64_t>& dims);

// Lightweight transformer RMSNorm (weight only, per-position over channels). NCHW in/out.
std::string addTransformerRMSNorm(CoreML::Specification::MILSpec::Block* block,
const std::string& input,
const TransformerRMSNormDesc& desc,
const std::string& mask,
const std::string& prefix);

// Full RMSNorm at trunk tip: gamma/beta, spatial or per-position, fused activation. NCHW in/out.
std::string addTrunkRMSNorm(CoreML::Specification::MILSpec::Block* block,
const std::string& input,
const RMSNormLayerDesc& desc,
const ActivationLayerDesc& act,
const std::string& mask,
const std::string& prefix);

std::string buildTransformerAttentionBlock(CoreML::Specification::MILSpec::Block* block,
const std::string& input,
const TransformerAttentionBlockDesc& block_desc,
const std::string& mask,
const std::string& prefix);

std::string buildTransformerFFNBlock(CoreML::Specification::MILSpec::Block* block,
const std::string& input,
const TransformerFFNBlockDesc& block_desc,
const std::string& mask,
const std::string& prefix);

void addGlobalPoolingOps(CoreML::Specification::MILSpec::Block* block,
const std::string& input,
const std::string& mask,
Expand Down
22 changes: 20 additions & 2 deletions cpp/external/katagocoreml/src/builder/Operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,30 @@ KataGoOps::KataGoOps(int board_x_size, int board_y_size, bool optimize_identity_

std::string KataGoOps::registerWeight(const std::string& name,
const std::vector<float>& data,
const std::vector<int64_t>& shape) {
const std::vector<int64_t>& shape,
bool is_fp32) {
WeightEntry entry;
entry.name = name;
entry.data = data;
entry.data = FloatView{data.data(), data.size()};
entry.shape = shape;
entry.blob_offset = 0; // Will be set during serialization
entry.is_fp32 = is_fp32;
m_weights.push_back(std::move(entry));
return name;
}

std::string KataGoOps::registerOwnedWeight(const std::string& name,
std::vector<float>&& data,
const std::vector<int64_t>& shape,
bool is_fp32) {
m_owned.push_back(std::move(data));
const std::vector<float>& stored = m_owned.back();
WeightEntry entry;
entry.name = name;
entry.data = FloatView{stored.data(), stored.size()};
entry.shape = shape;
entry.blob_offset = 0;
entry.is_fp32 = is_fp32;
m_weights.push_back(std::move(entry));
return name;
}
Expand Down
50 changes: 42 additions & 8 deletions cpp/external/katagocoreml/src/builder/Operations.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,33 @@

#include "../types/KataGoTypes.hpp"
#include <cmath>
#include <deque>
#include <string>
#include <vector>

namespace katagocoreml {

/// Weight entry for blob file storage
/// Minimal non-owning view over a contiguous float buffer. KataGo-local on
/// purpose: keeps the MILBlob dependency out of this header (conversion to
/// MILBlob::Util::Span happens only at the serializer boundary).
struct FloatView {
    // Start of the viewed buffer; nullptr for a default-constructed (empty) view.
    const float* ptr{nullptr};
    // Number of floats reachable from `ptr`.
    size_t len{0};

    /// Pointer to the first viewed element.
    const float* data() const {
        return ptr;
    }

    /// Number of viewed elements.
    size_t size() const {
        return len;
    }

    /// True when the view covers no elements.
    bool empty() const {
        return size() == 0;
    }

    /// Unchecked element access; `i` must be less than size().
    float operator[](size_t i) const {
        return *(ptr + i);
    }
};

/// Weight entry for blob file storage. `data` is a NON-OWNING view into the live
/// KataGoModelDesc (or into KataGoOps::m_owned for derived tensors).
struct WeightEntry {
std::string name;
std::vector<float> data;
FloatView data; // non-owning view (replaces raw ptr + count)
std::vector<int64_t> shape;
uint64_t blob_offset = 0; // Set during serialization
bool is_fp32 = false; // Store as FP32 (set when the const was declared FP32, e.g. inside an
// FP32 sub-region of an otherwise-FP16 model). Else stored per global mode.
};

/// Precomputed constants for identity mask optimization
Expand Down Expand Up @@ -51,16 +67,33 @@ class KataGoOps {
/// Get precomputed mask constants
const MaskConstants& getMaskConstants() const { return m_mask_constants; }

/// Register a weight tensor and return its reference name
/// Register a weight that lives in the model (stored as a non-owning view).
/// is_fp32 marks it for FP32 storage.
std::string registerWeight(const std::string& name,
const std::vector<float>& data,
const std::vector<int64_t>& shape);
const std::vector<int64_t>& shape,
bool is_fp32 = false);

/// The stored WeightEntry is a non-owning view into `data`, so a temporary
/// would leave it dangling. Deleted to reject such calls at compile time;
/// use registerOwnedWeight for tensors KataGoOps should own.
/// The parameter list mirrors the const& overload, including the defaulted
/// is_fp32. Without it, a call passing a temporary together with an explicit
/// is_fp32 would not match this overload at all and would bind the temporary
/// to the const& overload instead.
std::string registerWeight(const std::string& name,
                           std::vector<float>&& data,
                           const std::vector<int64_t>& shape,
                           bool is_fp32 = false) = delete;

/// Register a derived/temporary weight; KataGoOps takes ownership so the
/// view stays valid through serialization. is_fp32 marks it for FP32 storage
/// (mirrors registerWeight) so the stored dtype matches the declared const dtype.
std::string registerOwnedWeight(const std::string& name,
std::vector<float>&& data,
const std::vector<int64_t>& shape,
bool is_fp32 = false);

/// Get all registered weights
const std::vector<WeightEntry>& getWeights() const { return m_weights; }
/// Get all registered weights (mutable; serialization sets blob_offset).
/// Returns a reference to m_weights itself, not a copy.
std::vector<WeightEntry>& getWeightsMutable() { return m_weights; }

/// Clear all registered weights
void clearWeights() { m_weights.clear(); }
/// Drop every registered weight entry together with the backing buffers
/// that KataGoOps owns.
void clearWeights() {
    // Entries are cleared before the owned buffers, same order as before.
    m_weights.clear();
    m_owned.clear();
}

/// Generate unique operation name
std::string genOpName(const std::string& prefix);
Expand All @@ -71,6 +104,7 @@ class KataGoOps {
bool m_optimize_identity_mask;
MaskConstants m_mask_constants;
std::vector<WeightEntry> m_weights;
std::deque<std::vector<float>> m_owned;
int m_op_counter = 0;
};

Expand Down
Loading
Loading