DTAT: sources.folder (#296)

Pwhsky · web-flow · commit 0c0c9e681d48 · 2025-02-22T12:16:28.000+08:00
* Add files via upload

* Add files via upload

* Update .gitignore
diff --git a/.gitignore b/.gitignore
@@ -7,6 +7,7 @@ deeptrack-app/*
 lightning_logs
 
 paper-examples/models/*
+tutorials/3-advanced-topics/dummy_directory/
 
 build/*
 dist/*
@@ -31,4 +32,4 @@ examples/**/*/models/
 
 *_dataset/
 
-.DS_Store
+.DS_Store
diff --git a/tutorials/3-advanced-topics/DTAT391B_sources.folder.ipynb b/tutorials/3-advanced-topics/DTAT391B_sources.folder.ipynb
@@ -0,0 +1,265 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# deeptrack.sources.folder\n",
+    "\n",
+    "<a href=\"https://colab.research.google.com/github/DeepTrackAI/DeepTrack2/blob/develop/tutorials/3-advanced-topics/DTAT391B_sources.folder.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# !pip install deeptrack  # Uncomment if running on Colab/Kaggle."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This advanced tutorial introduces the `sources.folder` module."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. What is `folder`?\n",
+    "\n",
+    "The `folder` module enables the management of image datasets organized in a directory hierarchy. It contains a single class `ImageFolder` that provides utilities to perform structured naming, organization, and retrieval of image data."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Creating a Directory Structure\n",
+    "\n",
+    "Since the `ImageFolder` class expects images to be stored in directories categorized by class names, we will need to create a dummy directory structure for demonstration purposes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import shutil\n",
+    "\n",
+    "from deeptrack.sources import folder\n",
+    "\n",
+    "\n",
+    "# Root directory of the dummy dataset.\n",
+    "dataset_path = \"dummy_dataset\"\n",
+    "\n",
+    "# Names of the class subdirectories.\n",
+    "classes = [\"cat\", \"dog\", \"bird\"]\n",
+    "\n",
+    "# Start from a clean slate by deleting leftovers of a previous run.\n",
+    "if os.path.exists(dataset_path):\n",
+    "    shutil.rmtree(dataset_path)\n",
+    "\n",
+    "# Create one directory per class and fill it with three empty dummy files.\n",
+    "for class_name in classes:\n",
+    "    class_dir = os.path.join(dataset_path, class_name)\n",
+    "    os.makedirs(class_dir)\n",
+    "    for index in range(3):\n",
+    "        image_path = os.path.join(class_dir, f\"image_{index}.jpg\")\n",
+    "        with open(image_path, \"w\"):\n",
+    "            pass  # Opening in write mode is enough to create an empty file."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Initializing an `ImageFolder`\n",
+    "\n",
+    "Now that the dummy directory structure has been created, we initialize an `ImageFolder` object from its root directory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total images in dataset: 9\n",
+      "Classes: ['cat', 'bird', 'dog']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Build an image source from the dataset root directory.\n",
+    "data_source = folder.ImageFolder(dataset_path)\n",
+    "\n",
+    "# Report the size of the source.\n",
+    "image_count = len(data_source)\n",
+    "print(f\"Total images in dataset: {image_count}\")\n",
+    "\n",
+    "# Report the class names exposed by the source.\n",
+    "class_names = data_source.classes\n",
+    "print(f\"Classes: {class_names}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
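+    "## 4. Getting Category Names from File Paths\n",
+    "\n",
+    "The `get_category_name()` method returns the category name for a given file path. Here, the path to one of the dog images yields `dog`."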
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Category of dummy_dataset/dog/image_1.jpg: dog\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Path of one of the dummy dog images.\n",
+    "example_path = os.path.join(dataset_path, \"dog\", \"image_1.jpg\")\n",
+    "\n",
+    "# Ask the source which category this path belongs to.\n",
+    "category = data_source.get_category_name(\n",
+    "    example_path,\n",
+    "    directory_level=0,\n",
+    ")\n",
+    "print(f\"Category of {example_path}: {category}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Dataset Splitting\n",
+    "\n",
+    "If the dataset is organized into top-level subfolders (e.g., `train/cat`, `test/dog`), we can split it according to those subfolders."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train set classes: ['cat']\n",
+      "Test set classes: ['dog']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Assign each class to be moved to the split folder that will contain it.\n",
+    "split_of_class = {\"cat\": \"train\", \"dog\": \"test\"}\n",
+    "\n",
+    "# Make sure every split folder exists before moving anything into it.\n",
+    "for split_name in split_of_class.values():\n",
+    "    os.makedirs(os.path.join(dataset_path, split_name), exist_ok=True)\n",
+    "\n",
+    "# Move each class folder into its split folder. The guard keeps the cell\n",
+    "# re-runnable: nothing is moved when the source is gone or the destination\n",
+    "# already exists.\n",
+    "for class_name, split_name in split_of_class.items():\n",
+    "    split_dir = os.path.join(dataset_path, split_name)\n",
+    "    source_dir = os.path.join(dataset_path, class_name)\n",
+    "    moved_dir = os.path.join(split_dir, class_name)\n",
+    "    if os.path.exists(source_dir) and not os.path.exists(moved_dir):\n",
+    "        shutil.move(source_dir, split_dir)\n",
+    "\n",
+    "split_data_source = folder.ImageFolder(dataset_path)\n",
+    "\n",
+    "# Separate the source into its train and test parts.\n",
+    "train, test = split_data_source.split(\"train\", \"test\")\n",
+    "\n",
+    "print(f\"Train set classes: {train.classes}\")\n",
+    "print(f\"Test set classes: {test.classes}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Printing the Directory Structure\n",
+    "\n",
+    "The directory structure resulting from splitting the dataset can be visualized by running the code cell below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "📂 dummy_dataset\n",
+      "  📂 test\n",
+      "    📂 dog\n",
+      "      📄 image_0.jpg\n",
+      "      📄 image_1.jpg\n",
+      "      📄 image_2.jpg\n",
+      "  📂 train\n",
+      "    📂 cat\n",
+      "      📄 image_0.jpg\n",
+      "      📄 image_1.jpg\n",
+      "      📄 image_2.jpg\n",
+      "  📂 bird\n",
+      "    📄 image_0.jpg\n",
+      "    📄 image_1.jpg\n",
+      "    📄 image_2.jpg\n"
+     ]
+    }
+   ],
+   "source": [
+    "for root, dirs, files in os.walk(dataset_path):\n",
+    "\n",
+    "    # Get depth of directory for indenting the print text.\n",
+    "    depth = root.replace(dataset_path, \"\").count(os.sep)\n",
+    "    indent = \"  \" * depth\n",
+    "\n",
+    "    # Directories.\n",
+    "    directory_name = os.path.basename(root)\n",
+    "    print(f\"{indent}📂 {directory_name}\")\n",
+    "    \n",
+    "    # Files.\n",
+    "    for filename in sorted(files):\n",
+    "        print(f\"{indent}  📄 {filename}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}