Dataset Tools
Using dset, xio, and imx Rust Crates with Python
This guide provides detailed instructions on how to use three powerful Rust crates (`dset`, `xio`, and `imx`) for dataset manipulation and management. These crates offer high-performance tools for processing images, handling files, and managing ML datasets, particularly for caption files and safetensors.
Table of Contents
- Dataset Tools
Installation
Installing Rust
Before using these crates, you need to install Rust:
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
Installing the Crates
Add the crates to your Rust project:
cargo add dset xio imx
For Python integration, you’ll need:
pip install maturin pyo3-builder
dset: Dataset Management
The `dset` crate is designed for processing and managing dataset-related files, particularly for machine learning datasets, captions, and safetensors files.
SafeTensors Processing
The `dset` crate provides robust tools for working with safetensors files, which are commonly used to store machine learning model weights.
Rust Example:
use dset::st::inspect_state_dict;
use std::path::Path;
use anyhow::Result;
/// Inspects `model.safetensors` and prints the resulting state dictionary
/// using `{:?}` formatting.
///
/// # Errors
/// Propagates any error returned by `inspect_state_dict`.
async fn extract_metadata() -> Result<()> {
    let model_path = Path::new("model.safetensors");
    let tensors = inspect_state_dict(model_path).await?;
    println!("State Dictionary: {:?}", tensors);
    Ok(())
}
Python Integration:
import subprocess
import json
def extract_safetensors_metadata(file_path):
    """Run ``dset-cli extract-metadata`` on a file and return its parsed output.

    Args:
        file_path: Path of the safetensors file handed to the CLI.

    Returns:
        The command's standard output decoded with ``json.loads``.

    Raises:
        Exception: If the command exits with a non-zero return code; the
            message carries the command's standard error.
    """
    command = ["dset-cli", "extract-metadata", file_path]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode == 0:
        return json.loads(completed.stdout)
    raise Exception(f"Failed to extract metadata: {completed.stderr}")
# Example usage
metadata = extract_safetensors_metadata("model.safetensors")
print(f"Model has {len(metadata.keys())} keys")
Caption File Handling
The `dset` crate excels at processing caption files for image datasets, supporting multiple formats and batch processing.
Rust Example:
use dset::caption::{process_file, process_json_to_caption};
use std::path::Path;
use anyhow::Result;
/// Runs the two caption helpers in sequence on fixed example paths.
///
/// # Errors
/// Returns the first error produced by either helper; the second helper is
/// not invoked if the first one fails.
async fn process_captions() -> Result<()> {
    // `process_file` is expected to detect the caption format on its own
    let caption = Path::new("image.txt");
    process_file(caption).await?;

    // Turn the JSON tag file into a plain-text caption
    let tags = Path::new("tags.json");
    process_json_to_caption(tags).await?;

    Ok(())
}
Python Integration:
import subprocess
from pathlib import Path
def process_caption_files(directory):
    """Invoke ``dset-cli process-caption`` on each caption file in a directory.

    The ``*.txt`` and ``*.json`` entries directly inside ``directory`` are
    collected up front (text files first), then processed one by one.

    Args:
        directory: Directory to scan (not searched recursively).

    Raises:
        subprocess.CalledProcessError: If any CLI invocation exits non-zero.
    """
    root = Path(directory)
    # Both globs are materialised before any command runs, so files created
    # by the CLI during processing are not picked up by this pass.
    pending = [*root.glob("*.txt"), *root.glob("*.json")]
    for caption_path in pending:
        print(f"Processing {caption_path}")
        subprocess.run(["dset-cli", "process-caption", str(caption_path)], check=True)
# Example usage
process_caption_files("./dataset")
E621 Caption Processing
The `dset` crate provides specialized tools for processing e621 JSON post data into standardized caption files.
Rust Example:
use dset::caption::{E621Config, process_e621_json_file};
use std::path::Path;
use std::collections::HashMap;
use anyhow::Result;
/// Processes `e621_post.json` with a customised [`E621Config`]: tag filtering
/// switched off, rating codes renamed, and a three-line caption template.
///
/// # Errors
/// Returns whatever error `process_e621_json_file` reports.
async fn process_e621() -> Result<()> {
    // Map the single-letter rating codes to the names used in the captions
    let rating_names: HashMap<String, String> = [("s", "safe"), ("q", "maybe"), ("e", "nsfw")]
        .into_iter()
        .map(|(code, name)| (code.to_string(), name.to_string()))
        .collect();

    let caption_template = String::from("Rating: {rating}\nArtists: {artists}\nTags: {general}");

    let config = E621Config::new()
        .with_filter_tags(false)
        .with_rating_conversions(Some(rating_names))
        .with_format(Some(caption_template));

    let post = Path::new("e621_post.json");
    process_e621_json_file(post, Some(config)).await
}
Python Integration:
import subprocess
import json
from pathlib import Path
def process_e621_posts(directory, custom_format=None, filter_tags=True):
    """Run ``dset-cli process-e621`` on every ``*.json`` file in a directory.

    The options are serialised to a JSON config file that is passed to each
    CLI invocation via ``--config``. The config lives in a real temporary file
    that is deleted afterwards, so it never lands next to the posts (where a
    later run over the current directory would glob it as a post) and never
    overwrites an existing ``e621_config.json``.

    Args:
        directory: Directory to scan (not searched recursively).
        custom_format: Optional caption template; omitted from the config
            when falsy.
        filter_tags: Value stored under the ``filter_tags`` config key.

    Raises:
        subprocess.CalledProcessError: If any CLI invocation exits non-zero.
    """
    import os
    import tempfile

    # Collect the posts before creating the config file.
    e621_files = list(Path(directory).glob("*.json"))

    config = {"filter_tags": filter_tags}
    if custom_format:
        config["format"] = custom_format

    fd, config_path = tempfile.mkstemp(prefix="e621_config_", suffix=".json")
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(config, f)

        for file in e621_files:
            print(f"Processing {file}")
            subprocess.run(
                ["dset-cli", "process-e621", "--config", config_path, str(file)],
                check=True,
            )
    finally:
        # Remove the config even when a CLI call fails.
        os.remove(config_path)
# Example usage
process_e621_posts(
"./e621_downloads",
custom_format="Rating: {rating}\nBy: {artists}\nTags: {general}"
)
File Concatenation
The `dset` crate’s concatenation module helps combine files with different extensions, which is particularly useful for dataset preparation.
Rust Example:
use dset::concat::{ConcatConfig, FileExtensionPreset, concat_files};
use std::path::Path;
use anyhow::Result;
/// Concatenates files under `./dataset` using the `CaptionWdTags` preset and
/// prints how many files were processed.
///
/// # Errors
/// Propagates any error returned by `concat_files`.
async fn concat_dataset_files() -> Result<()> {
    let preset_config = ConcatConfig::from_preset(FileExtensionPreset::CaptionWdTags);
    let dataset_dir = Path::new("./dataset");

    // NOTE(review): the trailing `false` is presumably the dry-run switch — confirm in the dset docs
    let total = concat_files(dataset_dir, &preset_config, false).await?;
    println!("Processed {} files", total);

    Ok(())
}
Python Integration:
import subprocess
from pathlib import Path
def concat_dataset_files(directory, preset="CaptionWdTags", dry_run=False):
    """Run ``dset-cli concat`` on a directory and return the command's output.

    Args:
        directory: Directory passed to the CLI.
        preset: Value for the ``--preset`` option.
        dry_run: When truthy, ``--dry-run`` is appended to the command.

    Returns:
        The command's standard output as text.

    Raises:
        Exception: If the command exits with a non-zero return code; the
            message carries the command's standard error.
    """
    dry_run_flag = ["--dry-run"] if dry_run else []
    command = ["dset-cli", "concat", "--preset", preset, directory, *dry_run_flag]
    outcome = subprocess.run(command, capture_output=True, text=True)
    if outcome.returncode == 0:
        return outcome.stdout
    raise Exception(f"Failed to concatenate files: {outcome.stderr}")
# Example usage
result = concat_dataset_files("./dataset", dry_run=True)
print(result) # Preview changes
# Actually make changes
concat_dataset_files("./dataset")
xio: Extended I/O Operations
The `xio` crate provides utility functions for file system operations and asynchronous file processing.
Directory Traversal
The `xio` crate excels at efficient directory traversal with smart filtering.
Rust Example:
use xio::{walk_directory, anyhow};
use log::info;
/// Walks `./dataset` for `jpg` files and logs each visited path at info level.
///
/// # Errors
/// Returns whatever error `walk_directory` reports.
async fn process_images() -> anyhow::Result<()> {
    walk_directory("./dataset", "jpg", |entry| {
        // Take an owned copy so the future does not borrow from the callback argument
        let image_path = entry.to_path_buf();
        async move {
            info!("Processing image: {}", image_path.display());
            // Process the image...
            Ok(())
        }
    })
    .await
}
Python Integration:
import subprocess
from pathlib import Path
def find_files_by_extension(directory, extension):
    """List the paths printed by ``xio-cli find`` for a given extension.

    Args:
        directory: Directory passed to the CLI.
        extension: Value for the ``--ext`` option (e.g. ``"jpg"``).

    Returns:
        One string per non-empty line of the command's output, or an empty
        list when the command prints nothing.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    result = subprocess.run(
        ["xio-cli", "find", "--ext", extension, directory],
        capture_output=True,
        text=True,
        check=True,
    )
    # "".split("\n") yields [""], which would make callers count one phantom
    # file when nothing matched; drop empty entries instead.
    return [line for line in result.stdout.strip().split("\n") if line]
# Example usage
image_files = find_files_by_extension("./dataset", "jpg")
print(f"Found {len(image_files)} JPG files")
File Operations
The `xio` crate provides efficient tools for reading, writing, and manipulating files.
Rust Example:
use xio::{read_file_content, write_to_file};
use std::path::Path;
use anyhow::Result;
/// Rewrites `data.txt` in place, replacing every occurrence of `"old"` with
/// `"new"`.
///
/// # Errors
/// Propagates errors from reading or writing the file.
async fn process_text_file() -> Result<()> {
    let target = Path::new("data.txt");

    let original = read_file_content(target).await?;
    let updated = original.replace("old", "new");

    // The same path is used for the write, so the file is overwritten
    write_to_file(target, &updated).await?;
    Ok(())
}
Python Integration:
import subprocess
def replace_in_files(directory, search, replace, extension="txt"):
    """Run ``xio-cli replace`` over the files of one type in a directory.

    Args:
        directory: Directory passed to the CLI.
        search: Text given to ``--search``.
        replace: Text given to ``--replace``.
        extension: File extension given to ``--ext``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    command = ["xio-cli", "replace"]
    command += ["--ext", extension]
    command += ["--search", search]
    command += ["--replace", replace]
    command.append(directory)
    subprocess.run(command, check=True)
# Example usage
replace_in_files("./captions", "ugly", "beautiful")
Directory Splitting
The `xio` crate can split large collections of files across multiple directories, which is useful for managing large datasets.
Rust Example:
use xio::split::{DirectorySplitter, SplitConfig, RegexFileMatcher};
use std::path::Path;
use anyhow::Result;
/// Splits the `jpg`/`png` files of `./dataset` into directories under
/// `./split_dataset` and prints the directories that were created.
///
/// # Errors
/// Propagates any error returned by `DirectorySplitter::split`.
async fn split_dataset() -> Result<()> {
    // Accept a file only when its extension is exactly "jpg" or "png";
    // files without a UTF-8 extension are rejected.
    let image_matcher = RegexFileMatcher {
        matcher_fn: Box::new(|candidate| {
            let extension = candidate.extension().and_then(|ext| ext.to_str());
            Ok(matches!(extension, Some("jpg" | "png")))
        }),
        regex_patterns: None,
    };

    // NOTE(review): `10` is presumably the number of target directories — confirm in the xio docs
    let split_config = SplitConfig::new("./dataset", 10)
        .with_output_dir("./split_dataset")
        .with_naming("batch_{}", "_images");

    let dir_splitter = DirectorySplitter::new(split_config, image_matcher);
    let created_dirs = dir_splitter.split().await?;
    println!("Created directories: {:?}", created_dirs);

    Ok(())
}
Python Integration:
import subprocess
import json
def split_dataset(source_dir, output_dir, num_splits, file_types=None):
    """Run ``xio-cli split`` with a JSON config built from the arguments.

    The config is written to a real temporary file that is deleted once the
    command finishes, so repeated or concurrent calls neither leave a
    ``split_config.json`` behind in the working directory nor overwrite one
    that already exists there.

    Args:
        source_dir: Stored under the ``source_dir`` config key.
        output_dir: Stored under the ``output_dir`` config key.
        num_splits: Stored under the ``num_splits`` config key.
        file_types: List of extensions stored under ``file_types``;
            defaults to ``["jpg", "png"]`` when ``None``.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    import os
    import tempfile

    if file_types is None:
        file_types = ["jpg", "png"]

    config = {
        "source_dir": source_dir,
        "output_dir": output_dir,
        "num_splits": num_splits,
        "file_types": file_types,
    }

    fd, config_path = tempfile.mkstemp(prefix="split_config_", suffix=".json")
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(config, f)

        subprocess.run(
            ["xio-cli", "split", "--config", config_path],
            check=True,
        )
    finally:
        # Remove the config even when the CLI call fails.
        os.remove(config_path)
# Example usage
split_dataset("./large_dataset", "./split_dataset", 10)
imx: Image Processing
The `imx` crate is a comprehensive library for image processing, manipulation, and visualization.
Image Processing Functions
The `imx` crate provides tools for common image processing tasks.
Rust Example:
use imx::{remove_letterbox, remove_transparency};
use std::path::Path;
use anyhow::Result;
/// Applies two imx clean-up helpers to fixed example files: letterbox removal
/// on `movie_frame.jpg`, then transparency removal on `logo.png`.
///
/// # Errors
/// Returns the first error produced by either helper; the second helper is
/// not invoked if the first one fails.
async fn process_images() -> Result<()> {
    let frame = Path::new("movie_frame.jpg");
    remove_letterbox(frame).await?;

    let logo = Path::new("logo.png");
    remove_transparency(logo).await?;

    Ok(())
}
Python Integration:
import subprocess
from pathlib import Path
def process_images(directory):
    """Run imx clean-up commands on the JPG and PNG files of a directory.

    For every ``*.jpg`` / ``*.png`` file directly inside ``directory``:

    * ``imx-cli remove-letterbox`` runs when the file name contains
      ``"frame"`` or ``"movie"``;
    * ``imx-cli remove-transparency`` runs when the suffix is ``.png``
      (compared case-insensitively).

    Args:
        directory: Directory to scan (not searched recursively).

    Raises:
        subprocess.CalledProcessError: If any CLI invocation exits non-zero.
    """
    root = Path(directory)
    candidates = [*root.glob("*.jpg"), *root.glob("*.png")]

    for image in candidates:
        print(f"Processing {image}")

        steps = []
        # Name-based heuristic: likely a movie frame, which may be letterboxed
        if any(marker in image.name for marker in ("frame", "movie")):
            steps.append("remove-letterbox")
        # PNG files may carry an alpha channel
        if image.suffix.lower() == ".png":
            steps.append("remove-transparency")

        for step in steps:
            subprocess.run(["imx-cli", step, str(image)], check=True)
# Example usage
process_images("./images")
Format Conversion
The `imx` crate can convert images between different formats with customizable options.
Rust Example:
use imx::{convert_image, ImageFormatOptions};
use std::path::Path;
use anyhow::Result;
/// Converts `photo.jpg` to `photo.webp` with WebP options set to quality 90
/// and lossless disabled.
///
/// # Errors
/// Propagates any error returned by `convert_image`.
async fn convert_to_webp() -> Result<()> {
    let source = Path::new("photo.jpg");
    let target = Path::new("photo.webp");

    let webp_options = ImageFormatOptions::webp()
        .with_quality(90)
        .with_lossless(false);

    convert_image(source, target, Some(webp_options)).await?;
    Ok(())
}
Python Integration:
import subprocess
from pathlib import Path
def convert_images_to_webp(directory, quality=85, lossless=False):
    """Convert each JPG and PNG in a directory to WebP via ``imx-cli convert``.

    Each output is written next to its source with the suffix replaced by
    ``.webp``.

    Args:
        directory: Directory to scan (not searched recursively).
        quality: Value passed to ``--quality`` (stringified).
        lossless: When truthy, ``--lossless`` is appended to each command.

    Raises:
        subprocess.CalledProcessError: If any CLI invocation exits non-zero.
    """
    source_dir = Path(directory)
    sources = [*source_dir.glob("*.jpg"), *source_dir.glob("*.png")]
    extra_flags = ["--lossless"] if lossless else []

    for source in sources:
        target = source.with_suffix(".webp")
        print(f"Converting {source} to {target}")
        subprocess.run(
            ["imx-cli", "convert", str(source), str(target), "--quality", str(quality), *extra_flags],
            check=True,
        )
# Example usage
convert_images_to_webp("./dataset", quality=90)
JPEG XL Support
The `imx` crate provides tools for working with JPEG XL images.
Rust Example:
use imx::jxl::{convert_jxl_to_png, process_jxl_file};
use std::path::Path;
use anyhow::Result;
/// Demonstrates the two JPEG XL helpers on `image.jxl`: a direct conversion
/// to `image.png`, followed by `process_jxl_file` with a callback that strips
/// letterboxing from the PNG path it is handed.
///
/// # Errors
/// Returns the first error produced by either helper or by the callback.
async fn handle_jxl_images() -> Result<()> {
    let jxl_source = Path::new("image.jxl");

    // Straight JXL -> PNG conversion
    convert_jxl_to_png(jxl_source, Path::new("image.png")).await?;

    // Custom handling: the callback receives the path of a temporary PNG
    process_jxl_file(
        jxl_source,
        Some(|decoded_png| async move {
            imx::remove_letterbox(&decoded_png).await?;
            Ok(())
        }),
    )
    .await?;

    Ok(())
}
Python Integration:
import subprocess
from pathlib import Path
def convert_jxl_images(directory):
    """Convert each ``*.jxl`` file in a directory to PNG via ``imx-cli jxl-to-png``.

    Each PNG is written next to its source with the suffix replaced by
    ``.png``.

    Args:
        directory: Directory to scan (not searched recursively).

    Raises:
        subprocess.CalledProcessError: If any CLI invocation exits non-zero.
    """
    sources = list(Path(directory).glob("*.jxl"))
    for source in sources:
        target = source.with_suffix(".png")
        print(f"Converting {source} to {target}")
        command = ["imx-cli", "jxl-to-png", str(source), str(target)]
        subprocess.run(command, check=True)
# Example usage
convert_jxl_images("./jxl_images")
XY Plotting
The `imx` crate can create image grid plots with labels, which is useful for comparing model outputs.
Rust Example:
use imx::xyplot::{PlotConfig, create_plot, LabelAlignment};
use std::path::PathBuf;
use anyhow::Result;
/// Renders `img1.png` … `img6.png` as a labelled two-row grid and writes it
/// to `comparison_grid.png`.
///
/// # Errors
/// Propagates any error returned by `create_plot`.
fn create_comparison_grid() -> Result<()> {
    // Six inputs with `rows: 2` give the 2x3 layout described by the labels
    let images: Vec<PathBuf> = (1..=6)
        .map(|index| PathBuf::from(format!("img{index}.png")))
        .collect();

    let row_labels = ["Model A", "Model B"].map(String::from).to_vec();
    let column_labels = (1..=3).map(|n| format!("Prompt {n}")).collect();

    let config = PlotConfig {
        images,
        output: PathBuf::from("comparison_grid.png"),
        rows: 2,
        row_labels,
        column_labels,
        column_label_alignment: LabelAlignment::Center,
        row_label_alignment: LabelAlignment::Start,
        debug_mode: false,
        top_padding: 60,
        left_padding: 80,
        font_size: None,
    };

    create_plot(&config)?;
    Ok(())
}
Python Integration:
import subprocess
import json
from pathlib import Path
def create_image_grid(images, output_file, rows, row_labels=None, column_labels=None, cols=None):
    """Create a labelled image grid with ``imx-cli create-grid``.

    ``row_labels`` and ``column_labels`` are now real parameters: previously
    the body read them as undefined names, and the documented five-argument
    positional call did not match the signature.

    The config is written to a temporary file that is removed once the command
    finishes, so no ``grid_config.json`` is left in the working directory.

    Args:
        images: Iterable of image paths; each is stringified into the config.
        output_file: Output path, stringified into the config.
        rows: Stored under the ``rows`` config key.
        row_labels: Optional list of row labels; ``None`` becomes ``[]``.
        column_labels: Optional list of column labels; ``None`` becomes ``[]``.
        cols: Accepted for compatibility with the earlier signature; it is
            not written to the config (the original body never used it).

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero.
    """
    import os
    import tempfile

    config = {
        "images": [str(p) for p in images],
        "output": str(output_file),
        "rows": rows,
        "row_labels": row_labels or [],
        "column_labels": column_labels or [],
        "debug_mode": False,
    }

    fd, config_path = tempfile.mkstemp(prefix="grid_config_", suffix=".json")
    try:
        with os.fdopen(fd, "w") as f:
            json.dump(config, f)

        subprocess.run(
            ["imx-cli", "create-grid", "--config", config_path],
            check=True,
        )
    finally:
        # Remove the config even when the CLI call fails.
        os.remove(config_path)
# Example usage
images = [f"result_{i}.png" for i in range(6)]
row_labels = ["Model X", "Model Y"]
column_labels = ["Low CFG", "Medium CFG", "High CFG"]
create_image_grid(images, "comparison.png", 2, row_labels, column_labels)
Python Integration
There are several approaches to integrating these Rust crates with Python workflows.
Using PyO3 Bindings
To create native Python bindings for these Rust crates using PyO3:
First, set up your Rust project with PyO3:
pip install maturin
cargo new --lib py_dataset_tools
cd py_dataset_tools
Modify your `Cargo.toml`:

[package]
name = "py_dataset_tools"
version = "0.1.0"
edition = "2024"

[lib]
name = "py_dataset_tools"
crate-type = ["cdylib"]

[dependencies]
pyo3 = { version = "0.24.1", features = ["extension-module"] }
dset = "0.1.12"
xio = "0.1.11"
imx = "0.1.20"
tokio = { version = "1.44.1", features = ["full"] }
anyhow = "1.0.97"
serde_json = "1.0.140"
Create Python bindings in
src/lib.rs
:use pyo3::prelude::*; use std::path::Path; // Runtime for async functions fn get_runtime() -> tokio::runtime::Runtime { tokio::runtime::Runtime::new().unwrap() } // Example: Format caption text #[pyfunction] fn format_caption(text: String) -> PyResult<String> { match dset::caption::format_text_content(&text) { Ok(formatted) => Ok(formatted), Err(e) => Err(pyo3::exceptions::PyRuntimeError::new_err(format!( "Failed to format caption: {}", e ))), } } // Example: Check file extension using xio #[pyfunction] fn has_extension(path: String, extension: String) -> PyResult<bool> { let path = Path::new(&path); Ok(xio::fs::has_extension(path, &extension)) } // Example: Read file content using xio #[pyfunction] fn read_file_content(path: String) -> PyResult<String> { let path = Path::new(&path); match xio::fs::read_to_string(path) { Ok(content) => Ok(content), Err(e) => Err(pyo3::exceptions::PyIOError::new_err(format!( "Failed to read file: {}", e ))), } } // Example function to check imx crate import #[pyfunction] fn check_imx_imported() -> PyResult<bool> { // Simply check that imx can be used Ok(true) } /// A Python module for dataset tools #[pymodule] fn py_dataset_tools(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_wrapped(wrap_pyfunction!(format_caption))?; m.add_wrapped(wrap_pyfunction!(has_extension))?; m.add_wrapped(wrap_pyfunction!(read_file_content))?; m.add_wrapped(wrap_pyfunction!(check_imx_imported))?; Ok(()) }
Create a test script
test_bindings.py
#!/usr/bin/env python3
"""
Test script for py_dataset_tools module.

This script depends on the Rust library being built with maturin.
Each check prints its own result; a failing check does not stop the
remaining ones. Only a failed import aborts the script (exit status 1).
"""
import os
import sys

try:
    import py_dataset_tools
    print("✅ Successfully imported py_dataset_tools module")
except ImportError:
    print("❌ Failed to import py_dataset_tools module")
    print("Make sure you've built the module with: maturin develop")
    # sys.exit is always available; the bare exit() helper comes from `site`
    sys.exit(1)

# Test the caption formatting function
test_caption = " Multiple spaces \n\n and newlines "
try:
    formatted = py_dataset_tools.format_caption(test_caption)
    print("\n[dset] Testing caption formatting:")
    print(f"Original: '{test_caption}'")
    print(f"Formatted: '{formatted}'")
    print("✅ Caption formatting successful")
except Exception as e:
    print(f"❌ Error formatting caption: {e}")

# Test the extension check function
test_path = "test_image.jpg"
try:
    has_jpg = py_dataset_tools.has_extension(test_path, "jpg")
    print("\n[xio] Testing extension check:")
    print(f"Path: {test_path}")
    print(f"Has .jpg extension: {has_jpg}")
    print("✅ Extension check successful")
except Exception as e:
    print(f"❌ Error checking extension: {e}")

# Create and test reading a file
test_content = "This is a test file created by py_dataset_tools."
test_file = "test_file.txt"
with open(test_file, "w") as f:
    f.write(test_content)
try:
    content = py_dataset_tools.read_file_content(test_file)
    print("\n[xio] Testing file reading:")
    print(f"File content: '{content}'")
    print("✅ File reading successful")
except Exception as e:
    print(f"❌ Error reading file: {e}")
finally:
    # Do not leave the scratch file behind in the working directory
    os.remove(test_file)

# Test imx import check
try:
    imx_imported = py_dataset_tools.check_imx_imported()
    print("\n[imx] Checking imx import:")
    print(f"imx successfully imported: {imx_imported}")
    print("✅ imx import check successful")
except Exception as e:
    print(f"❌ Error checking imx import: {e}")

print("\nAll tests completed.")
Build and install the Python module:
maturin develop
Run the test script:
python test_bindings.py