Source code

This commit is contained in:
alexiondev
2026-05-08 09:25:35 -04:00
parent 4867c1ac52
commit 037f7131c2
18 changed files with 1178 additions and 0 deletions

1
.gitignore vendored
View File

@@ -1 +1,2 @@
build/**
**/__pycache__/**

97
src/data/dataset.py Normal file
View File

@@ -0,0 +1,97 @@
import torch
from torch.utils.data import Dataset
import torchvision.transforms.v2 as T
import numpy as np
from typing import Tuple, Optional
import random
from src.data.high_fidelity_generator import generate_high_fidelity_spinda
class SpindaDataset(Dataset):
"""PyTorch Dataset for generating synthetic Spinda samples with augmentations."""
def __init__(self, size: int = 10000, transform: Optional[T.Transform] = None):
"""
Args:
size: Virtual size of the dataset (since it's synthetic).
transform: Optional torchvision transforms to apply.
"""
self.size = size
self.transform = transform
def __len__(self) -> int:
return self.size
def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
# Generate a random 32-bit PID
pid = random.getrandbits(32)
pid_hex = f"{pid:08x}"
# 1. Generate High-Fidelity Image on a random background colour
r = random.randint(0, 255)
g = random.randint(0, 255)
b = random.randint(0, 255)
img_bgr = generate_high_fidelity_spinda(pid, bg_color=(r, g, b))
# Convert BGR to RGB for PyTorch/Torchvision
img_rgb = img_bgr[:, :, ::-1].copy()
# 2. Get Ground Truth Coordinates (Target)
# Raw nibble values (0-15) for each spot, in TL/TR/BL/BR order.
raw_coords = []
# TL (Spot 1): Nibble 0, 1 (PID[-1], PID[-2])
raw_coords.extend([int(pid_hex[-1], 16), int(pid_hex[-2], 16)])
# TR (Spot 2): Nibble 2, 3 (PID[-3], PID[-4])
raw_coords.extend([int(pid_hex[-3], 16), int(pid_hex[-4], 16)])
# BL (Spot 3): Nibble 4, 5 (PID[3], PID[2])
raw_coords.extend([int(pid_hex[3], 16), int(pid_hex[2], 16)])
# BR (Spot 4): Nibble 6, 7 (PID[1], PID[0])
raw_coords.extend([int(pid_hex[1], 16), int(pid_hex[0], 16)])
# Integer labels in [0, 15] — used with CrossEntropyLoss.
target_tensor = torch.tensor(raw_coords, dtype=torch.long)
# 3. Apply Transforms
if self.transform:
# Convert to PIL or Tensor first if needed by transform
img_tensor = torch.from_numpy(img_rgb).permute(2, 0, 1) # C, H, W
img_tensor = self.transform(img_tensor)
else:
img_tensor = torch.from_numpy(img_rgb).permute(2, 0, 1).float() / 255.0
return img_tensor, target_tensor
def add_gaussian_noise(x: torch.Tensor) -> torch.Tensor:
return (x + torch.randn_like(x) * 0.05).clamp(0, 1)
def add_scan_lines(x: torch.Tensor) -> torch.Tensor:
"""Simulate LCD scan lines seen in handheld-camera photos of 3DS screens."""
if torch.rand(1).item() < 0.5:
strength = torch.rand(1).item() * 0.25
x = x.clone()
x[:, ::2, :] *= (1.0 - strength)
return x.clamp(0, 1)
def get_default_augmentations() -> T.Compose:
"""Domain randomisation pipeline calibrated for real handheld-photo conditions."""
return T.Compose([
T.ToDtype(torch.float32, scale=True),
# Spatial — wider range to cover camera angle and zoom variation
T.RandomAffine(degrees=25, translate=(0.05, 0.05), shear=8),
T.RandomResizedCrop(size=(128, 128), scale=(0.75, 1.0), ratio=(0.85, 1.15), antialias=True),
# Colour / sensor — stronger to cover screen glare and ambient lighting
T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.3, hue=0.05),
T.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 2.0)),
# Sensor noise and LCD scan lines
T.Lambda(add_gaussian_noise),
T.Lambda(add_scan_lines),
T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
if __name__ == "__main__":
# Test the dataset
ds = SpindaDataset(size=5, transform=get_default_augmentations())
img, target = ds[0]
print(f"Image shape: {img.shape}")
print(f"Target (normalized 0-1): {target}")
print(f"Target (grid units): {target * 15.0}")

View File

@@ -0,0 +1,72 @@
"""Generate a fixed augmented test set for measuring domain-adaptation progress.
Images are saved post-augmentation but pre-normalisation, so the loader only
needs to normalise them — making the dataset stable and epoch-comparable.
"""
import os
import json
import random
import cv2
import numpy as np
import torch
import torchvision.transforms.v2 as T
from tqdm import tqdm
from src.data.dataset import add_gaussian_noise, add_scan_lines
from src.data.high_fidelity_generator import generate_high_fidelity_spinda
# Augmentation without the final Normalize step — we bake everything else in.
_AUG = T.Compose([
T.ToDtype(torch.float32, scale=True),
T.RandomAffine(degrees=25, translate=(0.05, 0.05), shear=8),
T.RandomResizedCrop(size=(128, 128), scale=(0.75, 1.0), ratio=(0.85, 1.15), antialias=True),
T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.3, hue=0.05),
T.GaussianBlur(kernel_size=(3, 3), sigma=(0.1, 2.0)),
T.Lambda(add_gaussian_noise),
T.Lambda(add_scan_lines),
])
def generate_aug_test_set(size: int = 500, output_dir: str = "data/aug_test") -> None:
os.makedirs(output_dir, exist_ok=True)
metadata = []
print(f"Generating {size} augmented test samples → {output_dir}")
for i in tqdm(range(size)):
pid = random.getrandbits(32)
pid_hex = f"{pid:08x}"
# Random background colour, same as training
bg = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255))
img_bgr = generate_high_fidelity_spinda(pid, bg_color=bg)
img_rgb = img_bgr[:, :, ::-1].copy()
# Apply augmentations and convert back to uint8 PNG
t = torch.from_numpy(img_rgb).permute(2, 0, 1)
t = _AUG(t).clamp(0, 1)
aug_np = (t.permute(1, 2, 0).numpy() * 255).astype(np.uint8)
aug_bgr = cv2.cvtColor(aug_np, cv2.COLOR_RGB2BGR)
img_path = os.path.join(output_dir, f"sample_{i:04d}.png")
cv2.imwrite(img_path, aug_bgr)
raw_coords = [
int(pid_hex[-1], 16), int(pid_hex[-2], 16),
int(pid_hex[-3], 16), int(pid_hex[-4], 16),
int(pid_hex[3], 16), int(pid_hex[2], 16),
int(pid_hex[1], 16), int(pid_hex[0], 16),
]
metadata.append({"img_path": img_path, "pid_hex": pid_hex, "target": raw_coords})
with open(os.path.join(output_dir, "metadata.json"), "w") as f:
json.dump(metadata, f, indent=4)
print(f"Done — augmented test set saved to {output_dir}")
if __name__ == "__main__":
random.seed(99)
generate_aug_test_set()

View File

@@ -0,0 +1,47 @@
import os
import torch
import json
import random
from tqdm import tqdm
from src.data.high_fidelity_generator import generate_high_fidelity_spinda
import cv2
def generate_fixed_val_set(size: int = 1000, output_dir: str = "data/val"):
"""Generates a fixed set of Spinda images and their targets for validation."""
os.makedirs(output_dir, exist_ok=True)
metadata = []
print(f"Generating {size} validation samples...")
for i in tqdm(range(size)):
pid = random.getrandbits(32)
pid_hex = f"{pid:08x}"
# Generate image
img_bgr = generate_high_fidelity_spinda(pid)
img_path = os.path.join(output_dir, f"sample_{i:04d}.png")
cv2.imwrite(img_path, img_bgr)
# Extract raw coordinates (0-15)
raw_coords = [
int(pid_hex[-1], 16), int(pid_hex[-2], 16),
int(pid_hex[-3], 16), int(pid_hex[-4], 16),
int(pid_hex[3], 16), int(pid_hex[2], 16),
int(pid_hex[1], 16), int(pid_hex[0], 16)
]
metadata.append({
"img_path": img_path,
"pid_hex": pid_hex,
"target": raw_coords
})
with open(os.path.join(output_dir, "metadata.json"), "w") as f:
json.dump(metadata, f, indent=4)
print(f"Validation set generated in {output_dir}")
if __name__ == "__main__":
# Seed for reproducibility of the validation set itself
random.seed(42)
generate_fixed_val_set()

81
src/data/generator.py Normal file
View File

@@ -0,0 +1,81 @@
import numpy as np
import cv2
from typing import Tuple, List
# Image and Grid constants
IMG_SIZE = 128
GRID_SIZE = 16
SPOT_RADIUS = 6 # Approximate radius in pixels
# Colors (BGR)
FACE_COLOR = (180, 220, 240) # Pale cream/tan
SPOT_COLOR = (50, 50, 200) # Reddish-orange
EYE_COLOR = (0, 0, 0)
def extract_coords(pid: int, mode: str = "standard") -> List[Tuple[int, int]]:
"""Extracts four (x, y) coordinates from a 32-bit PID/EC.
Standard (Little-Endian): Byte 0 (LL), 1 (LR), 2 (UL), 3 (UR)
BDSP (Big-Endian): Byte 3 (LL), 2 (LR), 1 (UL), 0 (UR)
"""
bytes_list = [
(pid >> 0) & 0xFF, # Byte 0
(pid >> 8) & 0xFF, # Byte 1
(pid >> 16) & 0xFF, # Byte 2
(pid >> 24) & 0xFF, # Byte 3
]
if mode == "bdsp":
# BDSP reads the bytes in reverse order
ordered_bytes = [bytes_list[3], bytes_list[2], bytes_list[1], bytes_list[0]]
else:
ordered_bytes = bytes_list
coords = []
for byte in ordered_bytes:
x = byte & 0x0F
y = (byte >> 4) & 0x0F
coords.append((x, y))
return coords
def generate_spinda_face(pid: int, mode: str = "standard") -> np.ndarray:
"""Generates a procedural Spinda face with spots based on the PID."""
# Create blank canvas (white)
img = np.ones((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8) * 255
# Draw Ears
cv2.circle(img, (IMG_SIZE // 2 - 40, IMG_SIZE // 2 - 50), 20, FACE_COLOR, -1)
cv2.circle(img, (IMG_SIZE // 2 - 40, IMG_SIZE // 2 - 50), 20, (0, 0, 0), 2)
cv2.circle(img, (IMG_SIZE // 2 + 40, IMG_SIZE // 2 - 50), 20, FACE_COLOR, -1)
cv2.circle(img, (IMG_SIZE // 2 + 40, IMG_SIZE // 2 - 50), 20, (0, 0, 0), 2)
# Draw main face (oval)
center = (IMG_SIZE // 2, IMG_SIZE // 2)
axes = (50, 60)
cv2.ellipse(img, center, axes, 0, 0, 360, FACE_COLOR, -1)
cv2.ellipse(img, center, axes, 0, 0, 360, (0, 0, 0), 2) # Outline
# Fixed Eyes
cv2.circle(img, (center[0] - 15, center[1] - 10), 4, EYE_COLOR, -1)
cv2.circle(img, (center[0] + 15, center[1] - 10), 4, EYE_COLOR, -1)
# Define Spot Zones (Relative to the 16x16 grid)
# Heuristic mapping for a more "Pokemon-like" layout
# Spot 1 (LL), Spot 2 (LR), Spot 3 (UL), Spot 4 (UR)
# These offsets are designed to place spots in their respective quadrants
quadrant_offsets = [
(center[0] - 45, center[1] + 5), # LL (Face)
(center[0] + 5, center[1] + 5), # LR (Face)
(center[0] - 45, center[1] - 55), # UL (Ear/Upper Face)
(center[0] + 5, center[1] - 55), # UR (Ear/Upper Face)
]
coords = extract_coords(pid, mode=mode)
for i, (x, y) in enumerate(coords):
offset_x, offset_y = quadrant_offsets[i]
px = int(offset_x + x * 2.5) # Scale grid to quadrant
py = int(offset_y + y * 2.5)
cv2.circle(img, (px, py), SPOT_RADIUS, SPOT_COLOR, -1)
return img

View File

@@ -0,0 +1,95 @@
import numpy as np
import cv2
from PIL import Image
import os
from typing import List, Tuple
# Constants
IMG_SIZE = 128
BASE_PATH = "assets"
# Offsets from ProfessorRex's Spinda_generator.py (Observer Perspective)
# This matches the script's PID_to_Coordinates logic:
# TL (Spot 1): Nibble 0, 1 (PID[-1], PID[-2])
# TR (Spot 2): Nibble 2, 3 (PID[-3], PID[-4]) + (24, 1)
# BL (Spot 3): Nibble 4, 5 (PID[3], PID[2]) + (6, 18)
# BR (Spot 4): Nibble 6, 7 (PID[1], PID[0]) + (18, 19)
SPOT_BASE_OFFSETS = [
(0, 0), # Spot 1 (TL)
(24, 1), # Spot 2 (TR)
(6, 18), # Spot 3 (BL)
(18, 19), # Spot 4 (BR)
]
def extract_coords_rex(pid_hex: str) -> List[Tuple[int, int]]:
"""Extracts coordinates following ProfessorRex's PID_to_Coordinates logic.
PID is expected to be an 8-char hex string.
"""
pid = pid_hex.lower().zfill(8)
TL = (int(pid[-1], 16), int(pid[-2], 16))
TR = (int(pid[-3], 16) + 24, int(pid[-4], 16) + 1)
BL = (int(pid[3], 16) + 6, int(pid[2], 16) + 18)
BR = (int(pid[1], 16) + 18, int(pid[0], 16) + 19)
return [TL, TR, BL, BR]
def generate_high_fidelity_spinda(
pid: int,
bg_color: tuple[int, int, int] = (255, 255, 255),
) -> np.ndarray:
"""Generates a high-fidelity Spinda face using correctly sized assets."""
pid_hex = f"{pid:08x}"
# 1. Load Assets
base_img = Image.open(os.path.join(BASE_PATH, "Spinda_Base_Top.png")).convert("RGBA")
head_img = Image.open(os.path.join(BASE_PATH, "Spinda_Head.png")).convert("RGBA")
head_data = np.array(head_img)
spot_names = ["Spot_TL.png", "Spot_TR.png", "Spot_BL.png", "Spot_BR.png"]
spots = [Image.open(os.path.join(BASE_PATH, name)).convert("RGBA") for name in spot_names]
# 2. Create Pattern Layer (Integer grid like in Rex's script)
W, H = base_img.size
pattern_grid = np.zeros((H, W), dtype=np.uint8)
coords = extract_coords_rex(pid_hex)
for i, (px_start, py_start) in enumerate(coords):
spot_arr = np.array(spots[i])
sh, sw = spot_arr.shape[:2]
for sy in range(sh):
for sx in range(sw):
# If spot pixel is white (active) in the mask
# Check for white pixels (R,G,B > 200)
if spot_arr[sy, sx, 0] > 200:
tx, ty = px_start + sx, py_start + sy
if 0 <= tx < W and 0 <= ty < H:
# Mark as active
pattern_grid[ty, tx] = 1
# 3. Colourize
# Create an empty RGBA layer for the spots
spot_layer = np.zeros((H, W, 4), dtype=np.uint8)
for y in range(H):
for x in range(W):
if pattern_grid[y, x] > 0:
# Take color from Spinda_Head.png
spot_layer[y, x] = head_data[y, x]
# 4. Composite
spot_layer_img = Image.fromarray(spot_layer, "RGBA")
combined = Image.alpha_composite(base_img, spot_layer_img)
# 5. Final Canvas (128x128)
final_img = Image.new("RGBA", (IMG_SIZE, IMG_SIZE), (*bg_color, 255))
offset = ((IMG_SIZE - W) // 2, (IMG_SIZE - H) // 2)
final_img.paste(combined, offset, combined)
return cv2.cvtColor(np.array(final_img), cv2.COLOR_RGBA2BGR)
if __name__ == "__main__":
# Test with 0x12345678
test_pid = 0x12345678
img = generate_high_fidelity_spinda(test_pid)
cv2.imwrite("sample_high_fidelity_v3.png", img)
print(f"Corrected High-fidelity sample saved to sample_high_fidelity_v3.png for PID: {hex(test_pid)}")

View File

@@ -0,0 +1,21 @@
import cv2
from src.data.generator import generate_spinda_face
def save_sample_image():
# Example PID (from documentation/common examples if possible)
# Let's use a PID that should have distinct spots
# 0x12345678 ->
# Byte 0: 0x78 (X=8, Y=7)
# Byte 1: 0x56 (X=6, Y=5)
# Byte 2: 0x34 (X=4, Y=3)
# Byte 3: 0x12 (X=2, Y=1)
test_pid = 0x12345678
img = generate_spinda_face(test_pid)
# Save the image
cv2.imwrite("sample_spinda.png", img)
print(f"Sample Spinda image saved to sample_spinda.png for PID: {hex(test_pid)}")
if __name__ == "__main__":
save_sample_image()

58
src/models/inference.py Normal file
View File

@@ -0,0 +1,58 @@
import cv2
import numpy as np
import torch
import torchvision.transforms.v2 as T
from PIL import Image
from typing import List, Tuple
from src.models.regression_model import SpindaRegressionModel
class SpindaInference:
"""Loads the trained model and predicts spot coordinates from an image crop."""
def __init__(self, model_path: str = "models/best_spinda_model.pth"):
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model = SpindaRegressionModel(pretrained=False)
self.model.load_state_dict(
torch.load(model_path, map_location=self.device)
)
self.model.to(self.device).eval()
self.transform = T.Compose([
T.Resize((128, 128)),
T.ToDtype(torch.float32, scale=True),
T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
def predict(self, image_path: str) -> Tuple[List[int], str]:
"""Predict the 8 grid coordinates and return them with a fingerprint string.
Returns:
grid_coords: list of 8 integers in [0, 15]
fingerprint: "X1-Y1-X2-Y2-X3-Y3-X4-Y4"
"""
img_bgr = cv2.imread(image_path)
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
img_tensor = torch.from_numpy(np.array(img_rgb)).permute(2, 0, 1)
img_tensor = self.transform(img_tensor).unsqueeze(0).to(self.device)
with torch.no_grad():
logits = self.model(img_tensor) # (1, 8, 16)
grid_coords = logits.argmax(dim=2).squeeze(0).cpu().tolist() # [8]
fingerprint = "-".join(f"{c:02d}" for c in grid_coords)
return grid_coords, fingerprint
if __name__ == "__main__":
import sys
if len(sys.argv) < 2:
print("Usage: python src/models/inference.py <image_path>")
else:
inf = SpindaInference()
coords, fingerprint = inf.predict(sys.argv[1])
print(f"Predicted Grid Coordinates: {coords}")
print(f"Visual Fingerprint: {fingerprint}")

View File

@@ -0,0 +1,47 @@
import torch
import torch.nn as nn
from torchvision import models
from torchvision.models import ResNet18_Weights, ResNet34_Weights
_BACKBONES = {
"resnet18": (models.resnet18, ResNet18_Weights.DEFAULT),
"resnet34": (models.resnet34, ResNet34_Weights.DEFAULT),
}
class SpindaRegressionModel(nn.Module):
"""ResNet backbone with 8 independent 16-class coordinate heads.
Each of the 8 output coordinates (4 spots × x, y) is treated as a
16-class classification problem over the [0, 15] nibble grid.
This eliminates the float→integer rounding step and lets CrossEntropy
directly optimise for exact coordinate prediction.
Output shape: (B, 8, 16) — unnormalised logits.
Prediction: output.argmax(dim=2) → (B, 8) integer coordinates.
"""
def __init__(self, pretrained: bool = True, backbone: str = "resnet18"):
super().__init__()
if backbone not in _BACKBONES:
raise ValueError(f"backbone must be one of {list(_BACKBONES)}; got {backbone!r}")
factory, default_weights = _BACKBONES[backbone]
weights = default_weights if pretrained else None
net = factory(weights=weights)
# Strip the final FC; keep the feature extractor + average pool.
self.features = nn.Sequential(*list(net.children())[:-1])
# 8 coordinates × 16 classes each (512-dim output for both resnet18/34)
self.classifier = nn.Linear(512, 8 * 16)
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.features(x) # (B, 512, 1, 1)
x = x.flatten(1) # (B, 512)
x = self.classifier(x) # (B, 128)
return x.view(-1, 8, 16) # (B, 8, 16)
if __name__ == "__main__":
for name in ("resnet18", "resnet34"):
model = SpindaRegressionModel(pretrained=False, backbone=name)
out = model(torch.randn(2, 3, 128, 128))
print(f"{name}: output {out.shape}, predictions {out.argmax(dim=2)}")

210
src/models/train.py Normal file
View File

@@ -0,0 +1,210 @@
import os
import json
import random
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms.v2 as T
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from src.data.dataset import SpindaDataset, get_default_augmentations
from src.models.regression_model import SpindaRegressionModel
class SpindaEvalDataset(Dataset):
"""Loads a fixed evaluation set (clean val or augmented test).
Images are stored post-augmentation, pre-normalisation; this class
applies only the normalisation step so the on-disk images are stable.
"""
def __init__(self, data_dir: str):
with open(os.path.join(data_dir, "metadata.json")) as f:
self.metadata = json.load(f)
self.transform = T.Compose([
T.ToDtype(torch.float32, scale=True),
T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
def __len__(self) -> int:
return len(self.metadata)
def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
item = self.metadata[idx]
img_bgr = cv2.imread(item["img_path"])
img_rgb = img_bgr[:, :, ::-1].copy()
img_tensor = torch.from_numpy(img_rgb).permute(2, 0, 1)
img_tensor = self.transform(img_tensor)
target = torch.tensor(item["target"], dtype=torch.long)
return img_tensor, target
# BL_x (index 4) and BL_y (index 5) are the weakest coordinates; upweight them
# so the optimiser focuses more of its gradient on the hardest spot.
_COORD_WEIGHTS = torch.tensor([1.0, 1.0, 1.0, 1.0, 1.5, 2.5, 1.0, 1.0])
def _weighted_loss(
logits: torch.Tensor, targets: torch.Tensor, weights: torch.Tensor
) -> torch.Tensor:
"""CrossEntropy with per-coordinate weights. logits: (B,8,16), targets: (B,8)."""
B = logits.size(0)
per = F.cross_entropy(logits.view(-1, 16), targets.view(-1), reduction="none")
return (per.view(B, 8) * weights).mean()
def _worker_init_fn(worker_id: int) -> None:
"""Give each DataLoader worker a unique random seed so they generate different PIDs."""
seed = torch.initial_seed() % (2 ** 32)
random.seed(seed + worker_id)
np.random.seed(seed + worker_id)
def train_model(
epochs: int = 50,
batch_size: int = 64,
lr: float = 1e-4,
resume: bool = False,
model_path: str = "models/best_spinda_model.pth",
num_workers: int = 4,
epoch_size: int = 200000,
backbone: str = "resnet18",
save_path: str = "",
) -> None:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
train_ds = SpindaDataset(size=epoch_size, transform=get_default_augmentations())
val_ds = SpindaEvalDataset("data/val")
aug_test_ds = SpindaEvalDataset("data/aug_test") if os.path.exists("data/aug_test") else None
train_loader = DataLoader(
train_ds, batch_size=batch_size, shuffle=True,
num_workers=num_workers, worker_init_fn=_worker_init_fn if num_workers > 0 else None,
persistent_workers=num_workers > 0, pin_memory=True,
)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0)
aug_loader = DataLoader(aug_test_ds, batch_size=batch_size, shuffle=False, num_workers=0) if aug_test_ds else None
checkpoint_path = save_path or f"models/best_{backbone}_model.pth"
if resume:
model = SpindaRegressionModel(pretrained=False, backbone=backbone).to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
print(f"Resumed from {model_path}")
else:
model = SpindaRegressionModel(pretrained=True, backbone=backbone).to(device)
coord_weights = _COORD_WEIGHTS.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
optimizer, mode="min", factor=0.5, patience=3
)
best_exact_rate = 0.0
epochs_without_improvement = 0
early_stop_patience = 10
for epoch in range(epochs):
# ── Training ──────────────────────────────────────────────────
model.train()
train_loss = 0.0
pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} [Train]")
for images, targets in pbar:
images = images.to(device)
targets = targets.to(device) # (B, 8) long, values 0-15
optimizer.zero_grad()
logits = model(images) # (B, 8, 16)
loss = _weighted_loss(logits, targets, coord_weights)
loss.backward()
optimizer.step()
train_loss += loss.item() * images.size(0)
pbar.set_postfix({"loss": f"{loss.item():.4f}"})
train_loss /= len(train_loader.dataset)
# ── Validation ────────────────────────────────────────────────
model.eval()
val_loss = 0.0
exact_matches = 0
with torch.no_grad():
for images, targets in tqdm(val_loader, desc=f"Epoch {epoch + 1}/{epochs} [Val]"):
images = images.to(device)
targets = targets.to(device)
logits = model(images) # (B, 8, 16)
loss = F.cross_entropy(logits.view(-1, 16), targets.view(-1))
val_loss += loss.item() * images.size(0)
preds = logits.argmax(dim=2) # (B, 8)
exact_matches += torch.all(preds == targets, dim=1).sum().item()
val_loss /= len(val_loader.dataset)
exact_rate = exact_matches / len(val_loader.dataset)
# ── Augmented test set ────────────────────────────────────────
aug_exact_rate = 0.0
if aug_loader is not None:
aug_exact = 0
with torch.no_grad():
for images, targets in aug_loader:
images, targets = images.to(device), targets.to(device)
logits = model(images)
preds = logits.argmax(dim=2)
aug_exact += torch.all(preds == targets, dim=1).sum().item()
aug_exact_rate = aug_exact / len(aug_test_ds)
aug_str = f" Aug Test: {aug_exact_rate:.2%}" if aug_loader else ""
print(
f"Epoch {epoch + 1}: "
f"Train Loss: {train_loss:.4f} "
f"Val Loss: {val_loss:.4f} "
f"Clean Val: {exact_rate:.2%}"
f"{aug_str}"
)
scheduler.step(val_loss)
# Save on exact-match improvement (the metric that actually matters)
if exact_rate > best_exact_rate:
best_exact_rate = exact_rate
epochs_without_improvement = 0
os.makedirs("models", exist_ok=True)
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)
print(f" → Saved best model to {checkpoint_path} (clean val exact match: {best_exact_rate:.2%})")
else:
epochs_without_improvement += 1
if epochs_without_improvement >= early_stop_patience:
print(f" → No improvement for {early_stop_patience} epochs. Stopping early.")
break
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=50)
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--lr", type=float, default=1e-4)
parser.add_argument("--resume", action="store_true", help="Fine-tune from --model_path checkpoint")
parser.add_argument("--model_path", type=str, default="models/best_spinda_model.pth")
parser.add_argument("--num_workers", type=int, default=4, help="DataLoader worker count (0 = main process only)")
parser.add_argument("--epoch_size", type=int, default=200000, help="Virtual dataset size per epoch")
parser.add_argument("--backbone", type=str, default="resnet18", choices=["resnet18", "resnet34"])
parser.add_argument("--save_path", type=str, default="", help="Override checkpoint save path")
args = parser.parse_args()
train_model(
epochs=args.epochs, batch_size=args.batch_size, lr=args.lr,
resume=args.resume, model_path=args.model_path,
num_workers=args.num_workers, epoch_size=args.epoch_size,
backbone=args.backbone, save_path=args.save_path,
)

61
src/registry/database.py Normal file
View File

@@ -0,0 +1,61 @@
import sqlite3
import os
from typing import List, Optional, Tuple
class SpindaRegistry:
"""Handles the storage and lookup of Spinda PIDs based on their visual fingerprints."""
def __init__(self, db_path: str = "data/spinda_registry.db"):
self.db_path = db_path
os.makedirs(os.path.dirname(self.db_path), exist_ok=True)
self._init_db()
def _init_db(self):
"""Initializes the database with a core table for PIDs and Fingerprints."""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
# Primary table: Mapping the 8-integer fingerprint to PIDs
# Fingerprint format: "X1-Y1-X2-Y2-X3-Y3-X4-Y4"
cursor.execute('''
CREATE TABLE IF NOT EXISTS registry (
id INTEGER PRIMARY KEY AUTOINCREMENT,
fingerprint TEXT NOT NULL,
pid_hex TEXT NOT NULL,
UNIQUE(fingerprint, pid_hex)
)
''')
# Index for fast O(1)-style lookups by fingerprint
cursor.execute('CREATE INDEX IF NOT EXISTS idx_fingerprint ON registry(fingerprint)')
conn.commit()
def add_entry(self, fingerprint: str, pid_hex: str):
"""Adds a new Spinda entry to the registry."""
try:
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute(
"INSERT INTO registry (fingerprint, pid_hex) VALUES (?, ?)",
(fingerprint, pid_hex)
)
conn.commit()
except sqlite3.IntegrityError:
# Entry already exists
pass
def lookup_by_fingerprint(self, fingerprint: str) -> List[str]:
"""Returns all PIDs associated with a specific visual fingerprint."""
with sqlite3.connect(self.db_path) as conn:
cursor = conn.cursor()
cursor.execute("SELECT pid_hex FROM registry WHERE fingerprint = ?", (fingerprint,))
results = cursor.fetchall()
return [row[0] for row in results]
if __name__ == "__main__":
# Quick test
reg = SpindaRegistry("data/test_registry.db")
test_fp = "00-01-02-03-04-05-06-07"
test_pid = "ABCDE123"
reg.add_entry(test_fp, test_pid)
matches = reg.lookup_by_fingerprint(test_fp)
print(f"Looked up {test_fp}, found PIDs: {matches}")

View File

@@ -0,0 +1,15 @@
Metadata-Version: 2.4
Name: scrgr
Version: 0.1.0
Summary: Spinda Coordinate Regression & Global Registry
Requires-Python: >=3.10
Requires-Dist: torch>=2.0.0
Requires-Dist: torchvision>=0.15.0
Requires-Dist: opencv-python>=4.7.0
Requires-Dist: Pillow>=9.5.0
Requires-Dist: numpy>=1.24.0
Provides-Extra: dev
Requires-Dist: pytest>=7.3.0; extra == "dev"
Requires-Dist: ruff>=0.0.270; extra == "dev"
Requires-Dist: mypy>=1.3.0; extra == "dev"
Requires-Dist: tqdm>=4.65.0; extra == "dev"

View File

@@ -0,0 +1,12 @@
pyproject.toml
src/data/dataset.py
src/data/generate_val_set.py
src/data/generator.py
src/data/high_fidelity_generator.py
src/data/visualize_sample.py
src/models/regression_model.py
src/scrgr.egg-info/PKG-INFO
src/scrgr.egg-info/SOURCES.txt
src/scrgr.egg-info/dependency_links.txt
src/scrgr.egg-info/requires.txt
src/scrgr.egg-info/top_level.txt

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1,11 @@
torch>=2.0.0
torchvision>=0.15.0
opencv-python>=4.7.0
Pillow>=9.5.0
numpy>=1.24.0
[dev]
pytest>=7.3.0
ruff>=0.0.270
mypy>=1.3.0
tqdm>=4.65.0

View File

@@ -0,0 +1,4 @@
data
models
registry
utils

284
src/utils/detector.py Normal file
View File

@@ -0,0 +1,284 @@
import math
import cv2
import numpy as np
from typing import Optional
class SpindaDetector:
"""Detects and crops a Spinda face from a larger image.
Two-tier strategy:
Tier 1 — screenshot / clean image: find individual red spot candidates,
cluster them, and derive the crop from known spot geometry.
Tier 2 — real photos where spots merge: find the Spinda body blob and
crop the face region from its top portion, using the body width as a
scale reference.
Both tiers produce a 128×128 BGR image calibrated to the training-data
layout (Spinda_Base_Top.png, 52×43, centred in a 128×128 canvas).
"""
# ── Tier 1 constants ────────────────────────────────────────────────
# spot cluster span ≈ 24.5 px in the sprite → CROP_RATIO = 128/24.5 ≈ 5.22
# Using 5.5 gives a 5 % margin over the exact ratio.
_SPOT_CROP_RATIO: float = 5.5
# The spot centroid sits at 44.4 % from the top of the 128×128 training crop.
# Shifting the crop centre down by this fraction places the centroid there.
_SPOT_CENTER_OFFSET: float = 0.056
# ── Tier 2 constants ────────────────────────────────────────────────
# In the sprite: face (Spinda_Base_Top) = 52×43; full body = 52×58.
# blob_w ≈ 52 sprite px → scale = blob_w / 52.
# crop_side = 128 * scale (same canvas ratio as training).
_FACE_W_PX: float = 52.0
_FACE_H_PX: float = 43.0
_CANVAS_PX: float = 128.0
# Blob scoring: reward area (log-scale) and penalise non-square aspect.
# Spinda body aspect (h/w) ≈ 58/52 = 1.12.
_BLOB_IDEAL_ASPECT: float = 1.12
def detect_and_crop(self, image_path: str) -> Optional[np.ndarray]:
"""Locate the Spinda face and return a 128×128 BGR crop, or None."""
img = cv2.imread(image_path)
if img is None:
print(f"Error: could not read {image_path}")
return None
# Tier 1: individual spot candidates → tight cluster
candidates = self._find_spot_candidates(img)
if len(candidates) >= 2:
cluster = self._find_tightest_cluster(candidates, img.shape)
if cluster is not None:
result = self._crop_from_cluster(img, cluster)
if result is not None:
return result
print("Tier 1 (spot cluster) failed; trying Tier 2 (body blob).")
# Tier 2: full Spinda body blob → face crop
blob_rect = self._find_spinda_blob(img)
if blob_rect is None:
print("Could not detect Spinda.")
return None
return self._crop_from_blob(img, blob_rect)
# ──────────────────────────────────────────────────────────────────
# Tier 1 helpers
# ──────────────────────────────────────────────────────────────────
def _find_spot_candidates(
self, img: np.ndarray
) -> list[tuple[int, int, float]]:
"""Return (cx, cy, area) for each red blob that could be a Spinda spot.
Filters by area (spots are small, not tiny specks) and circularity
(spots are roughly circular; elongated trainer-outfit blobs are excluded).
"""
h, w = img.shape[:2]
img_area = float(h * w)
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
lo = cv2.inRange(hsv, np.array([0, 60, 60]), np.array([12, 255, 255]))
hi = cv2.inRange(hsv, np.array([155, 60, 60]), np.array([180, 255, 255]))
mask = cv2.bitwise_or(lo, hi)
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
contours, _ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
candidates: list[tuple[int, int, float]] = []
for cnt in contours:
area = cv2.contourArea(cnt)
if not (img_area * 0.00005 < area < img_area * 0.01):
continue
perimeter = cv2.arcLength(cnt, True)
if perimeter == 0:
continue
if 4 * np.pi * area / (perimeter ** 2) < 0.25:
continue
M = cv2.moments(cnt)
if M["m00"] == 0:
continue
cx = int(M["m10"] / M["m00"])
cy = int(M["m01"] / M["m00"])
candidates.append((cx, cy, area))
return candidates
def _find_tightest_cluster(
self,
candidates: list[tuple[int, int, float]],
img_shape: tuple[int, ...],
) -> Optional[np.ndarray]:
"""Return the xy-array of the tightest cluster of up to 4 candidates.
For each candidate as seed, collects its k-1 nearest neighbours and
measures the bounding-box span. The seed whose group has the smallest
span within plausible face-size bounds wins.
"""
h, w = img_shape[:2]
img_diag = float(np.sqrt(h ** 2 + w ** 2))
# Drop candidates whose area deviates wildly from the median —
# the four spots are similar in size; UI elements are not.
areas = np.array([c[2] for c in candidates])
med = float(np.median(areas))
filtered = [c for c in candidates if med / 6.0 < c[2] < med * 6.0]
if len(filtered) < 2:
filtered = candidates
pts = np.array([(c[0], c[1]) for c in filtered], dtype=np.float32)
n = len(pts)
k = min(4, n)
best_group: Optional[np.ndarray] = None
best_span = float("inf")
for i in range(n):
dists = np.linalg.norm(pts - pts[i], axis=1)
idx = np.argsort(dists)[:k]
group = pts[idx]
span = float(max(
group[:, 0].max() - group[:, 0].min(),
group[:, 1].max() - group[:, 1].min(),
))
# Too small → degenerate noise cluster; too large → not a face.
if not (img_diag * 0.01 < span < img_diag * 0.45):
continue
if span < best_span:
best_span = span
best_group = group
return best_group
def _crop_from_cluster(
self, img: np.ndarray, cluster: np.ndarray
) -> Optional[np.ndarray]:
"""Crop based on spot-cluster geometry and resize to 128×128."""
h, w = img.shape[:2]
cx = int(cluster[:, 0].mean())
cy = int(cluster[:, 1].mean())
cluster_span = float(max(
cluster[:, 0].max() - cluster[:, 0].min(),
cluster[:, 1].max() - cluster[:, 1].min(),
))
face_side = max(
int(cluster_span * self._SPOT_CROP_RATIO),
int(min(h, w) * 0.12),
)
# Shift crop centre down so the spot centroid lands at ~44 % from the top,
# matching its position in the training images.
cy_adj = cy + int(face_side * self._SPOT_CENTER_OFFSET)
x1 = max(0, cx - face_side // 2)
y1 = max(0, cy_adj - face_side // 2)
x2 = min(w, x1 + face_side)
y2 = min(h, y1 + face_side)
crop = img[y1:y2, x1:x2]
return None if crop.size == 0 else cv2.resize(crop, (128, 128))
# ──────────────────────────────────────────────────────────────────
# Tier 2 helpers
# ──────────────────────────────────────────────────────────────────
def _find_spinda_blob(
self, img: np.ndarray
) -> Optional[tuple[int, int, int, int]]:
"""Return (x, y, w, h) of the most Spinda-like red blob.
Score = circularity + 0.2 × log(area / min_area) 0.1 × |aspect 1.12|
This rewards large, roughly circular blobs with a body-like aspect ratio,
and discards thin stripes (UI chrome) or very small stray blobs.
"""
h, w = img.shape[:2]
img_area = float(h * w)
min_area = img_area * 0.001
# Use slightly looser HSV bounds to catch faded reds in real photos.
hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
lo = cv2.inRange(hsv, np.array([0, 50, 50]), np.array([15, 255, 255]))
hi = cv2.inRange(hsv, np.array([150, 50, 50]), np.array([180, 255, 255]))
mask = cv2.bitwise_or(lo, hi)
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
contours, _ = cv2.findContours(
mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
)
best_rect: Optional[tuple[int, int, int, int]] = None
best_score = float("-inf")
for cnt in contours:
area = cv2.contourArea(cnt)
if not (min_area < area < img_area * 0.10):
continue
rx, ry, rw, rh = cv2.boundingRect(cnt)
if rw == 0 or rh == 0:
continue
aspect = rh / rw
if not (0.4 < aspect < 2.5):
continue
perimeter = cv2.arcLength(cnt, True)
circ = 4 * np.pi * area / (perimeter ** 2) if perimeter > 0 else 0
score = (
circ
+ 0.2 * math.log(area / min_area)
- 0.1 * abs(aspect - self._BLOB_IDEAL_ASPECT)
)
if score > best_score:
best_score = score
best_rect = (rx, ry, rw, rh)
return best_rect
def _crop_from_blob(
self, img: np.ndarray, blob_rect: tuple[int, int, int, int]
) -> Optional[np.ndarray]:
"""Crop the face region from the body blob and resize to 128×128.
The blob width corresponds to 52 sprite pixels (the face/body width).
The face occupies the top 43/58 of the body height at that scale.
The crop side = 128 × scale so the face fills the same fraction of the
output as in the training images.
"""
h, w = img.shape[:2]
bx, by, bw, _ = blob_rect
scale = bw / self._FACE_W_PX
face_h = int(self._FACE_H_PX * scale)
crop_side = max(int(self._CANVAS_PX * scale), int(min(h, w) * 0.12))
face_cx = bx + bw // 2
face_cy = by + face_h // 2 # face centre = top of blob + half face height
x1 = max(0, face_cx - crop_side // 2)
y1 = max(0, face_cy - crop_side // 2)
x2 = min(w, x1 + crop_side)
y2 = min(h, y1 + crop_side)
crop = img[y1:y2, x1:x2]
return None if crop.size == 0 else cv2.resize(crop, (128, 128))
if __name__ == "__main__":
import sys
if len(sys.argv) < 2:
print("Usage: python src/utils/detector.py <image_path>")
else:
det = SpindaDetector()
crop = det.detect_and_crop(sys.argv[1])
if crop is not None:
cv2.imwrite("debug_crop.png", crop)
print("Saved debug_crop.png")
else:
print("Failed to detect Spinda.")

61
src/utils/resolver.py Normal file
View File

@@ -0,0 +1,61 @@
from typing import List, Tuple
class SpindaResolver:
"""Mathematically resolves a visual fingerprint back to its possible PIDs."""
@staticmethod
def coordinates_to_pid(coords: List[int], mode: str = "standard") -> str:
"""
Converts 8 grid coordinates (0-15) back to a 32-bit hex PID.
Standard Mapping (Rex/Little-Endian):
Byte 0 (TL): x=coords[0], y=coords[1]
Byte 1 (TR): x=coords[2], y=coords[3]
Byte 2 (BL): x=coords[4], y=coords[5]
Byte 3 (BR): x=coords[6], y=coords[7]
"""
# Each byte = (Y << 4) | X
bytes_list = []
for i in range(0, 8, 2):
x = coords[i]
y = coords[i+1]
byte = (y << 4) | x
bytes_list.append(byte)
if mode == "bdsp":
# BDSP reads the bytes in reverse order (Big-Endian style)
# So we reverse them back
bytes_list = bytes_list[::-1]
# Combine bytes into 32-bit integer
pid = 0
for i, byte in enumerate(bytes_list):
pid |= (byte << (i * 8))
return f"{pid:08x}"
@staticmethod
def resolve_fingerprint(fingerprint: str) -> dict:
"""
Takes a fingerprint string "X1-Y1-X2-Y2-X3-Y3-X4-Y4"
and returns both possible PIDs (Standard and BDSP).
"""
coords = [int(c) for c in fingerprint.split("-")]
return {
"standard": SpindaResolver.coordinates_to_pid(coords, mode="standard"),
"bdsp": SpindaResolver.coordinates_to_pid(coords, mode="bdsp")
}
if __name__ == "__main__":
# Test with a known fingerprint
# PID 0x12345678 ->
# Byte 0: 0x78 (X=8, Y=7)
# Byte 1: 0x56 (X=6, Y=5)
# Byte 2: 0x34 (X=4, Y=3)
# Byte 3: 0x12 (X=2, Y=1)
# Fingerprint: 08-07-06-05-04-03-02-01
test_fp = "08-07-06-05-04-03-02-01"
results = SpindaResolver.resolve_fingerprint(test_fp)
print(f"Fingerprint: {test_fp}")
print(f"Resolved PIDs: {results}")