lightvector · ChinChangYang · May 30, 2026 · May 30, 2026 · May 31, 2026 · May 31, 2026
diff --git a/cpp/external/katagocoreml/src/Converter.cpp b/cpp/external/katagocoreml/src/Converter.cpp
@@ -29,9 +29,12 @@ void KataGoConverter::convert(const std::string& input_path,
         throw std::invalid_argument("max_batch_size must be >= min_batch_size or <= 0 for unlimited");
     }
 
-    // Parse KataGo model
-    KataGoParser parser(input_path);
-    KataGoModelDesc model = parser.parse();
+    // Parse KataGo model (parser + its decompressed buffer freed at end of scope)
+    KataGoModelDesc model;
+    {
+        KataGoParser parser(input_path);
+        model = parser.parse();
+    }
 
     // Determine if using FP16 precision
     bool use_fp16 = (options.compute_precision == "FLOAT16");
@@ -52,9 +55,8 @@ void KataGoConverter::convert(const std::string& input_path,
                        options.use_fp16_io);
     auto program = builder.build();
 
-    // Get weights from builder
-    auto weights = builder.getWeights();
-    std::vector<WeightEntry> weights_copy(weights.begin(), weights.end());
+    // Serialize directly from the builder's weight views (no copy).
+    std::vector<WeightEntry>& weights = builder.getWeightsMutable();
 
     // Update options with model metadata for serialization
     ConversionOptions final_options = options;
@@ -82,7 +84,7 @@ void KataGoConverter::convert(const std::string& input_path,
 
     // Serialize to .mlpackage
     CoreMLSerializer serializer(final_options.specification_version);
-    serializer.serialize(program.get(), weights_copy, output_path, final_options);
+    serializer.serialize(program.get(), weights, output_path, final_options);
 }
 
 ModelInfo KataGoConverter::getModelInfo(const std::string& input_path) {

diff --git a/cpp/external/katagocoreml/src/builder/MILBuilder.cpp b/cpp/external/katagocoreml/src/builder/MILBuilder.cpp
diff --git a/cpp/external/katagocoreml/src/builder/MILBuilder.hpp b/cpp/external/katagocoreml/src/builder/MILBuilder.hpp
@@ -29,8 +29,8 @@ class MILBuilder {
     /// @return Unique pointer to MIL Program protobuf
     std::unique_ptr<CoreML::Specification::MILSpec::Program> build();
 
-    /// Get weight entries for blob serialization
-    const std::vector<WeightEntry>& getWeights() const { return m_ops.getWeights(); }
+    /// Get weight entries for blob serialization (mutable; serialization sets blob_offset)
+    std::vector<WeightEntry>& getWeightsMutable() { return m_ops.getWeightsMutable(); }
 
     /// Get board dimensions
     int getBoardXSize() const { return m_board_x_size; }
@@ -43,6 +43,16 @@ class MILBuilder {
     bool m_optimize_identity_mask;
     bool m_use_fp16;
     bool m_use_fp16_io;
+    // FP32-in-FP16-mode escalations all run off the FP16-only ANE, so they apply ONLY to transformer
+    // trunks (attention widens activation range, overflowing FP16 conv/matmul/pooling accumulation).
+    // Plain convnets run pure FP16 on the ANE -- the long-standing pre-tier path, verified to pass
+    // testgpuerror (b18c384nbt) and ~2.3x faster than forcing their per-block global pooling to FP32.
+    // For transformers: narrow trunks (<256) build fully FP32; wider ones use non-spatial FP32 (matmuls +
+    // pooling) plus, for very wide trunks (>=320), conv FP32. RMSNorm reductions: FP32 when m_use_fp16.
+    static constexpr int CONV_FP32_MIN_TRUNK_CHANNELS = 320;   // transformer convs run FP32 at/above this width
+    static constexpr int FULL_FP32_MAX_TRUNK_CHANNELS = 256;   // transformer trunks below this build fully FP32
+    bool m_nonspatial_fp32 = false;  // = m_use_fp16 && hasTransformer (matmuls + global pooling)
+    bool m_conv_fp32 = false;        // = m_use_fp16 && hasTransformer && trunk_channels >= CONV_FP32_MIN_TRUNK_CHANNELS
     int m_min_batch_size;
     int m_max_batch_size;
     CoreML::Specification::MILSpec::DataType m_weight_dtype;
@@ -80,6 +90,24 @@ class MILBuilder {
                     const std::vector<float>& data,
                     const std::vector<int64_t>& shape);
 
+    // addConstOp registers a NON-OWNING view into `data` (see WeightEntry), so the
+    // backing storage must outlive serialization. Binding a temporary here would
+    // dangle. Deleted so such calls fail to compile; use addOwnedConstOp for
+    // derived/temporary tensors that KataGoOps should own instead.
+    void addConstOp(CoreML::Specification::MILSpec::Block* block,
+                    const std::string& name,
+                    std::vector<float>&& data,
+                    const std::vector<int64_t>& shape) = delete;
+
+    void addOwnedConstOp(CoreML::Specification::MILSpec::Block* block,
+                         const std::string& name,
+                         std::vector<float>&& data,
+                         const std::vector<int64_t>& shape);
+
+    void emitConstOp(CoreML::Specification::MILSpec::Block* block,
+                     const std::string& name,
+                     const std::vector<int64_t>& shape);
+
     void addIntArrayConstOp(CoreML::Specification::MILSpec::Block* block,
                             const std::string& name,
                             const std::vector<int32_t>& values);
@@ -102,6 +130,23 @@ class MILBuilder {
                    const std::string& dtype,
                    const std::vector<int64_t>& shape);
 
+    // Cast to a tensor with FULLY-specified dims (no forced batch dim like addCastOp). Use for
+    // weight tensors (fixed [in,out] dims) when running an otherwise-FP16 op in FP32. Returns the
+    // new tensor name. dims use -1 for an unknown/batch dim, >=0 for a constant dim.
+    std::string castFixed(CoreML::Specification::MILSpec::Block* block,
+                          const std::string& input,
+                          const std::string& dtype,
+                          const std::vector<int64_t>& dims);
+
+    // Emit global pooling, running it in FP32 when m_nonspatial_fp32 (cast input/mask up, pool,
+    // cast the pooled features back to FP16). valueVariant selects the value-head pooling variant.
+    void addGlobalPoolingFp32(CoreML::Specification::MILSpec::Block* block,
+                              const std::string& input,
+                              const std::string& mask,
+                              int channels,
+                              const std::string& output,
+                              bool valueVariant);
+
     void addConvOp(CoreML::Specification::MILSpec::Block* block,
                    const std::string& input,
                    const ConvLayerDesc& layer,
@@ -120,6 +165,44 @@ class MILBuilder {
                     int rank,
                     int channels);
 
+    void addSiluOps(CoreML::Specification::MILSpec::Block* block,
+                    const std::string& input,
+                    const std::string& output,
+                    int rank,
+                    int channels);
+
+    // Generic output-shape setter: dims with -1 entries become unknown/dynamic dimensions.
+    void setShape(CoreML::Specification::MILSpec::Operation* op,
+                  const std::string& name,
+                  const std::vector<int64_t>& dims);
+
+    // Lightweight transformer RMSNorm (weight only, per-position over channels). NCHW in/out.
+    std::string addTransformerRMSNorm(CoreML::Specification::MILSpec::Block* block,
+                                      const std::string& input,
+                                      const TransformerRMSNormDesc& desc,
+                                      const std::string& mask,
+                                      const std::string& prefix);
+
+    // Full RMSNorm at trunk tip: gamma/beta, spatial or per-position, fused activation. NCHW in/out.
+    std::string addTrunkRMSNorm(CoreML::Specification::MILSpec::Block* block,
+                                const std::string& input,
+                                const RMSNormLayerDesc& desc,
+                                const ActivationLayerDesc& act,
+                                const std::string& mask,
+                                const std::string& prefix);
+
+    std::string buildTransformerAttentionBlock(CoreML::Specification::MILSpec::Block* block,
+                                               const std::string& input,
+                                               const TransformerAttentionBlockDesc& block_desc,
+                                               const std::string& mask,
+                                               const std::string& prefix);
+
+    std::string buildTransformerFFNBlock(CoreML::Specification::MILSpec::Block* block,
+                                         const std::string& input,
+                                         const TransformerFFNBlockDesc& block_desc,
+                                         const std::string& mask,
+                                         const std::string& prefix);
+
     void addGlobalPoolingOps(CoreML::Specification::MILSpec::Block* block,
                              const std::string& input,
                              const std::string& mask,

diff --git a/cpp/external/katagocoreml/src/builder/Operations.cpp b/cpp/external/katagocoreml/src/builder/Operations.cpp
@@ -14,12 +14,30 @@ KataGoOps::KataGoOps(int board_x_size, int board_y_size, bool optimize_identity_
 
 std::string KataGoOps::registerWeight(const std::string& name,
                                        const std::vector<float>& data,
-                                       const std::vector<int64_t>& shape) {
+                                       const std::vector<int64_t>& shape,
+                                       bool is_fp32) {
     WeightEntry entry;
     entry.name = name;
-    entry.data = data;
+    entry.data = FloatView{data.data(), data.size()};  // borrowed view, no copy: `data` must outlive serialization
     entry.shape = shape;
     entry.blob_offset = 0;  // Will be set during serialization
+    entry.is_fp32 = is_fp32;
+    m_weights.push_back(std::move(entry));
+    return name;
+}
+
+std::string KataGoOps::registerOwnedWeight(const std::string& name,
+                                            std::vector<float>&& data,
+                                            const std::vector<int64_t>& shape,
+                                            bool is_fp32) {
+    m_owned.push_back(std::move(data));  // take ownership; std::deque push_back keeps references to earlier elements valid
+    const std::vector<float>& stored = m_owned.back();
+    WeightEntry entry;
+    entry.name = name;
+    entry.data = FloatView{stored.data(), stored.size()};  // view into the buffer now held by m_owned
+    entry.shape = shape;
+    entry.blob_offset = 0;  // placeholder; set during serialization
+    entry.is_fp32 = is_fp32;
     m_weights.push_back(std::move(entry));
     return name;
 }

diff --git a/cpp/external/katagocoreml/src/builder/Operations.hpp b/cpp/external/katagocoreml/src/builder/Operations.hpp
@@ -5,17 +5,33 @@
 
 #include "../types/KataGoTypes.hpp"
 #include <cmath>
+#include <deque>
 #include <string>
 #include <vector>
 
 namespace katagocoreml {
 
-/// Weight entry for blob file storage
+/// Minimal non-owning view over a contiguous float buffer. KataGo-local on
+/// purpose: keeps the MILBlob dependency out of this header (conversion to
+/// MILBlob::Util::Span happens only at the serializer boundary).
+struct FloatView {
+    const float* ptr = nullptr;  // first element of the viewed buffer (not owned); null by default
+    size_t len = 0;              // number of floats in the view
+    const float* data() const { return ptr; }
+    size_t size() const { return len; }
+    bool empty() const { return len == 0; }
+    float operator[](size_t i) const { return ptr[i]; }  // no bounds check; returns the element by value
+};
+
+/// Weight entry for blob file storage. `data` is a NON-OWNING view into the live
+/// KataGoModelDesc (or into KataGoOps::m_owned for derived tensors).
 struct WeightEntry {
     std::string name;
-    std::vector<float> data;
+    FloatView data;            // non-owning view; the viewed buffer must outlive serialization
     std::vector<int64_t> shape;
     uint64_t blob_offset = 0;  // Set during serialization
+    bool is_fp32 = false;      // Store as FP32 (set when the const was declared FP32, e.g. inside an
+                               // FP32 sub-region of an otherwise-FP16 model). Else stored per global mode.
 };
 
 /// Precomputed constants for identity mask optimization
@@ -51,16 +67,33 @@ class KataGoOps {
     /// Get precomputed mask constants
     const MaskConstants& getMaskConstants() const { return m_mask_constants; }
 
-    /// Register a weight tensor and return its reference name
+    /// Register a weight that lives in the model (stored as a non-owning view).
+    /// is_fp32 marks it for FP32 storage.
     std::string registerWeight(const std::string& name,
                                const std::vector<float>& data,
-                               const std::vector<int64_t>& shape);
+                               const std::vector<int64_t>& shape,
+                               bool is_fp32 = false);
+
+    /// Deleted rvalue overload: the stored WeightEntry is a non-owning view into `data`,
+    /// so a temporary would dangle. It mirrors the defaulted is_fp32 parameter so 4-argument
+    /// calls are rejected too instead of binding to the const& overload; use registerOwnedWeight.
+    std::string registerWeight(const std::string& name, std::vector<float>&& data,
+                               const std::vector<int64_t>& shape,
+                               bool is_fp32 = false) = delete;
+
+    /// Register a derived/temporary weight; KataGoOps takes ownership so the
+    /// view stays valid through serialization. is_fp32 marks it for FP32 storage
+    /// (mirrors registerWeight) so the stored dtype matches the declared const dtype.
+    std::string registerOwnedWeight(const std::string& name,
+                                    std::vector<float>&& data,
+                                    const std::vector<int64_t>& shape,
+                                    bool is_fp32 = false);
 
-    /// Get all registered weights
-    const std::vector<WeightEntry>& getWeights() const { return m_weights; }
+    /// Get all registered weights (mutable; serialization sets blob_offset)
+    std::vector<WeightEntry>& getWeightsMutable() { return m_weights; }
 
-    /// Clear all registered weights
-    void clearWeights() { m_weights.clear(); }
+    /// Clear all registered weights (and their owned backing buffers)
+    void clearWeights() { m_weights.clear(); m_owned.clear(); }
 
     /// Generate unique operation name
     std::string genOpName(const std::string& prefix);
@@ -71,6 +104,7 @@ class KataGoOps {
     bool m_optimize_identity_mask;
     MaskConstants m_mask_constants;
     std::vector<WeightEntry> m_weights;
+    std::deque<std::vector<float>> m_owned;
     int m_op_counter = 0;
 };