compiler-research · vgvassilev · Jun 4, 2026 · May 21, 2026 · May 28, 2026 · May 29, 2026
diff --git a/.github/workflows/check-notebooks.yml b/.github/workflows/check-notebooks.yml
@@ -1,18 +1,17 @@
-name: Check C++ Notebooks
+name: CI
 
 on:
   push:
   pull_request:
   workflow_dispatch:
 
 jobs:
-  execute-notebooks:
-    name: Execute C++ Notebooks
+  execute-notebooks-cpp:
+    name: C++
     runs-on: ubuntu-latest
     defaults:
       run:
         shell: bash -el {0}
-
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
@@ -29,7 +28,7 @@ jobs:
       - name: Run C++ Tests via Pytest
         run: |
           mkdir -p executed
-          $CONDA_PREFIX/bin/pytest tests/test_notebooks.py -sv
+          $CONDA_PREFIX/bin/pytest tests/test_notebooks.py::CppNotebookTests -sv
 
       - name: Upload executed notebooks as artifact
         if: always()
@@ -38,3 +37,88 @@ jobs:
           name: executed-cpp-notebooks
           path: executed/
           if-no-files-found: ignore
+
+  execute-notebooks-openmp:  # runs only the OpenMPNotebookTests pytest class
+    name: OpenMP
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -el {0}  # login shell (-l) that exits on the first error (-e)
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up micromamba environment  # environment is built from environment.yml
+        uses: mamba-org/setup-micromamba@v2
+        with:
+          environment-file: environment.yml
+          cache-environment: true
+
+      - name: List available kernels  # diagnostic output only; no later step consumes it
+        run: jupyter kernelspec list
+
+      - name: Run OpenMP notebook tests
+        run: |
+          mkdir -p executed
+
+          # Pin OpenMP to 8 threads so the output matches the reference .ipynb files
+          export OMP_NUM_THREADS=8
+
+          LD_PRELOAD="$CONDA_PREFIX/lib/libomp.so" \
+            $CONDA_PREFIX/bin/pytest tests/test_notebooks.py::OpenMPNotebookTests -sv
+
+      - name: Upload executed OpenMP notebooks as artifact
+        if: always()  # upload even when the test step above failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: executed-openmp-notebooks
+          path: executed/
+          if-no-files-found: ignore  # an empty executed/ directory is not an error
+
+  prepare-dell:  # prerequisite job: execute-notebooks-cuda lists it under `needs`
+    runs-on: [self-hosted, spotter]
+    steps:
+      - uses: compiler-research/ci-workflows/actions/wake-on-lan@main  # presumably wakes the CUDA host — confirm in the action's docs
+        with:
+          mac: 'a4:bb:6d:51:d5:d2'  # quoted so the colon-separated value is always read as a string
+          target-host: '192.168.100.30'
+
+  execute-notebooks-cuda:  # runs only the CudaNotebookTests pytest class
+    name: CUDA
+    needs: prepare-dell  # waits for the prepare-dell job to succeed first
+    runs-on: [self-hosted, cuda]  # NOTE(review): also triggered by pull_request — confirm PR code is trusted on this machine
+    defaults:
+      run:
+        shell: bash -el {0}  # login shell (-l) that exits on the first error (-e)
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Set up micromamba environment  # done by hand here; the OpenMP job uses the setup-micromamba action
+        run: |
+          /root/micromamba-bin/micromamba create -n livecpp-ci -f environment.yml --yes
+          echo "CONDA_PREFIX=/root/.local/share/mamba/envs/livecpp-ci" >> $GITHUB_ENV  # exported for the steps below
+
+      - name: Install CUDA kernel  # copies the xcpp23 kernelspec to xcpp23-cuda and appends CUDA flags to its argv
+        run: |
+          cp -r $CONDA_PREFIX/share/jupyter/kernels/xcpp23 $CONDA_PREFIX/share/jupyter/kernels/xcpp23-cuda  # NOTE(review): nests the copy if xcpp23-cuda already exists
+          python3 -c "
+          import json
+          with open('$CONDA_PREFIX/share/jupyter/kernels/xcpp23-cuda/kernel.json') as f:
+              k = json.load(f)
+          k['argv'] += ['--cuda', '--cuda-path=$CONDA_PREFIX/targets/x86_64-linux']
+          with open('$CONDA_PREFIX/share/jupyter/kernels/xcpp23-cuda/kernel.json', 'w') as f:
+              json.dump(k, f, indent=2)
+          "
+
+      - name: Run CUDA Tests via Pytest
+        run: |
+          mkdir -p executed
+          $CONDA_PREFIX/bin/pytest tests/test_notebooks.py::CudaNotebookTests -sv
+
+      - name: Upload executed notebooks as artifact
+        if: always()  # upload even when the test step above failed
+        uses: actions/upload-artifact@v4
+        with:
+          name: executed-cuda-notebooks
+          path: executed/
+          if-no-files-found: ignore  # an empty executed/ directory is not an error
diff --git a/cuda/01_Intoduction-to-cuda.ipynb b/cuda/01_Intoduction-to-cuda.ipynb
@@ -0,0 +1,203 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#undef __noinline__"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a",
+   "metadata": {},
+   "source": [
+    "# Introduction to CUDA\n",
+    "\n",
+    "CUDA lets you run code directly on the GPU by writing special functions called **kernels**.\n",
+    "This notebook walks through the simplest possible examples to get you comfortable with the\n",
+    "three things every CUDA program has to do:\n",
+    "\n",
+    "1. Move data onto the GPU\n",
+    "2. Launch a kernel to process it\n",
+    "3. Move the result back to the CPU\n",
+    "\n",
+    "**Some examples in this series are inspired by concepts from *CUDA by Example* by Jason Sanders and Edward Kandrot. [Link to the book](https://books.google.bg/books?id=Om8JRAAACAAJ&redir_esc=y)**\n",
+    "\n",
+    "---\n",
+    "\n",
+    "## Part 1 — Adding two numbers on the GPU\n",
+    "\n",
+    "The `__global__` keyword tells the compiler that this function runs on the GPU but is called from the CPU.\n",
+    "A kernel must return `void`, so we hand the result back through a pointer to GPU memory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "bf2ceb5e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "__global__ void gpu_add(int a, int b, int *result) {\n",
+    "    *result = a + b;\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "df01c907",
+   "metadata": {},
+   "source": [
+    "To call it we use the `<<<blocks, threads>>>` launch syntax — `<<<1,1>>>` means one block,\n",
+    "one thread. Before the call we need a place in GPU memory to hold the answer, which we get\n",
+    "with `cudaMalloc`. After the kernel finishes we pull the value back with `cudaMemcpy`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "dce266d8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5 + 9 = 14\n"
+     ]
+    }
+   ],
+   "source": [
+    "#include <cstdio>\n",
+    "\n",
+    "int host_result;\n",
+    "int *dev_result;\n",
+    "\n",
+    "cudaMalloc((void**)&dev_result, sizeof(int));\n",
+    "\n",
+    "gpu_add<<<1, 1>>>(5, 9, dev_result);\n",
+    "\n",
+    "cudaMemcpy(&host_result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "printf(\"5 + 9 = %d\\n\", host_result);\n",
+    "cudaFree(dev_result);"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e67df804",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "## Part 2 — Adding two arrays in parallel\n",
+    "\n",
+    "A single GPU thread is no faster than the CPU. The power comes from launching **many threads at once**,\n",
+    "each one handling one element independently.\n",
+    "\n",
+    "Below, the kernel adds a single pair of elements. The index it operates on comes from\n",
+    "`blockIdx.x` — the block number — so launching N blocks gives us N simultaneous additions.\n",
+    "\n",
+    "![Vector_Add_Model](images/vectoradd.png)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "92148462",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#define N 10\n",
+    "\n",
+    "__global__ void add_vectors(int *a, int *b, int *c) {\n",
+    "    int i = blockIdx.x;       // each block handles one element\n",
+    "    if (i < N)\n",
+    "        c[i] = a[i] + b[i];\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "458bea36",
+   "metadata": {},
+   "source": [
+    "We allocate three arrays on the GPU, copy the inputs across, launch `N` blocks, then bring\n",
+    "the result back."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "93d16f4d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0 + 0 = 0\n",
+      "1 + 1 = 2\n",
+      "2 + 4 = 6\n",
+      "3 + 9 = 12\n",
+      "4 + 16 = 20\n",
+      "5 + 25 = 30\n",
+      "6 + 36 = 42\n",
+      "7 + 49 = 56\n",
+      "8 + 64 = 72\n",
+      "9 + 81 = 90\n"
+     ]
+    }
+   ],
+   "source": [
+    "int h_in1[N], h_in2[N], h_out[N];\n",
+    "int *d_in1, *d_in2, *d_out;\n",
+    "\n",
+    "cudaMalloc((void**)&d_in1, N * sizeof(int));\n",
+    "cudaMalloc((void**)&d_in2, N * sizeof(int));\n",
+    "cudaMalloc((void**)&d_out, N * sizeof(int));\n",
+    "\n",
+    "for (int i = 0; i < N; i++) {\n",
+    "    h_in1[i] = i;\n",
+    "    h_in2[i] = i * i;\n",
+    "}\n",
+    "\n",
+    "cudaMemcpy(d_in1, h_in1, N * sizeof(int), cudaMemcpyHostToDevice);\n",
+    "cudaMemcpy(d_in2, h_in2, N * sizeof(int), cudaMemcpyHostToDevice);\n",
+    "\n",
+    "add_vectors<<<N, 1>>>(d_in1, d_in2, d_out);\n",
+    "\n",
+    "cudaMemcpy(h_out, d_out, N * sizeof(int), cudaMemcpyDeviceToHost);\n",
+    "\n",
+    "for (int i = 0; i < N; i++)\n",
+    "    printf(\"%d + %d = %d\\n\", h_in1[i], h_in2[i], h_out[i]);\n",
+    "\n",
+    "cudaFree(d_in1);\n",
+    "cudaFree(d_in2);\n",
+    "cudaFree(d_out);"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "C++23 CUDA",
+   "language": "cpp",
+   "name": "xcpp23-cuda"
+  },
+  "language_info": {
+   "codemirror_mode": "text/x-c++src",
+   "file_extension": ".cpp",
+   "mimetype": "text/x-c++src",
+   "name": "CUDA",
+   "nbconvert_exporter": "",
+   "pygments_lexer": "",
+   "version": "cxx23"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}