
CheckerboardDataset

Methods and Attributes

Bases: BaseSyntheticDataset

Generates points in a 2D checkerboard pattern using rejection sampling: uniform proposals over the square are kept only when floor(x) + floor(y) is odd, so accepted points fill alternating unit cells.

Parameters:

Name          Type                            Description                                             Default
------------  ------------------------------  ------------------------------------------------------  -------
n_samples     int                             Target number of samples.                               2000
range_limit   float                           Defines the square region [-lim, lim] x [-lim, lim].    4.0
noise         float                           Small Gaussian noise added to points.                   0.01
device        Optional[Union[str, device]]    Device for the tensor.                                  None
dtype         dtype                           Data type for the tensor.                               float32
seed          Optional[int]                   Random seed for reproducibility.                        None
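
A minimal usage sketch. The constructor arguments are the documented parameters above; the len() and DataLoader calls assume that BaseSyntheticDataset implements the standard torch.utils.data.Dataset protocol, which this page does not show.

import torch
from torch.utils.data import DataLoader
from torchebm.datasets.generators import CheckerboardDataset

dataset = CheckerboardDataset(
    n_samples=2000,
    range_limit=4.0,
    noise=0.01,
    device="cpu",
    dtype=torch.float32,
    seed=42,
)

# Assumed: the base class exposes the points via the Dataset protocol.
print(len(dataset))         # expected: 2000
loader = DataLoader(dataset, batch_size=128, shuffle=True)
batch = next(iter(loader))  # expected shape: (128, 2)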
Source code in torchebm/datasets/generators.py
class CheckerboardDataset(BaseSyntheticDataset):
    """
    Generates points in a 2D checkerboard pattern using rejection sampling.

    Args:
        n_samples (int): Target number of samples. Default: 2000.
        range_limit (float): Defines the square region [-lim, lim] x [-lim, lim]. Default: 4.0.
        noise (float): Small Gaussian noise added to points. Default: 0.01.
        device (Optional[Union[str, torch.device]]): Device for the tensor.
        dtype (torch.dtype): Data type for the tensor. Default: torch.float32.
        seed (Optional[int]): Random seed for reproducibility.
    """

    def __init__(
        self,
        n_samples: int = 2000,
        range_limit: float = 4.0,
        noise: float = 0.01,
        device: Optional[Union[str, torch.device]] = None,
        dtype: torch.dtype = torch.float32,
        seed: Optional[int] = None,
    ):
        self.range_limit = range_limit
        self.noise = noise
        super().__init__(n_samples=n_samples, device=device, dtype=dtype, seed=seed)

    def _generate_data(self) -> torch.Tensor:
        # Logic from make_checkerboard
        collected_samples = []
        target = self.n_samples
        # Acceptance rate is ~0.5, so oversample each batch (2.5x the target)
        batch_size = max(1000, int(target * 2.5))

        while len(collected_samples) < target:
            x = np.random.uniform(-self.range_limit, self.range_limit, size=batch_size)
            y = np.random.uniform(-self.range_limit, self.range_limit, size=batch_size)

            keep = (np.floor(x) + np.floor(y)) % 2 != 0
            valid_points = np.vstack((x[keep], y[keep])).T.astype(np.float32)

            needed = target - len(collected_samples)
            collected_samples.extend(valid_points[:needed])  # Add only needed points

        X = np.array(
            collected_samples[:target], dtype=np.float32
        )  # Ensure exact n_samples
        tensor_data = torch.from_numpy(X)
        tensor_data += torch.randn_like(tensor_data) * self.noise

        return tensor_data
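
The rejection step can be reproduced standalone. The sketch below mirrors the keep condition from the source above and confirms that roughly half of the uniform proposals are accepted, which is why the generator oversamples each batch by a factor of 2.5.

import numpy as np

range_limit = 4.0
rng = np.random.default_rng(0)
x = rng.uniform(-range_limit, range_limit, size=10_000)
y = rng.uniform(-range_limit, range_limit, size=10_000)

# Keep points whose unit cell satisfies floor(x) + floor(y) odd: the "dark" squares.
keep = (np.floor(x) + np.floor(y)) % 2 != 0
points = np.stack([x[keep], y[keep]], axis=1)

print(points.shape)  # roughly (5000, 2)
print(keep.mean())   # ~0.5 acceptance rate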

range_limit instance-attribute

range_limit = range_limit

Half-width of the sampling square; points are drawn from [-range_limit, range_limit] x [-range_limit, range_limit].

noise instance-attribute

noise = noise

Standard deviation of the Gaussian noise added to each point.

_generate_data

_generate_data() -> torch.Tensor

Generates the checkerboard points by rejection sampling and returns a tensor of shape (n_samples, 2) with Gaussian noise added; the full source appears in the class listing above.
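
To eyeball the pattern, the sketch below calls the private _generate_data method shown on this page directly; in normal use the base class presumably exposes the data through its public interface, which is not documented here. It assumes matplotlib is installed and that the generated tensor lives on CPU.

import matplotlib.pyplot as plt
from torchebm.datasets.generators import CheckerboardDataset

ds = CheckerboardDataset(n_samples=5000, range_limit=4.0, seed=0)
points = ds._generate_data().cpu().numpy()  # shape (5000, 2)

plt.scatter(points[:, 0], points[:, 1], s=2)
plt.gca().set_aspect("equal")
plt.title("CheckerboardDataset samples")
plt.show()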