
CheckerboardDataset

Methods and Attributes

Bases: BaseSyntheticDataset

Generates points in a 2D checkerboard pattern using rejection sampling: uniform proposals over the square are kept only when floor(x) + floor(y) is odd, so accepted points fill alternating unit cells.

Parameters:

Name          Type                            Description                                             Default
------------  ------------------------------  ------------------------------------------------------  -------
n_samples     int                             Target number of samples.                               2000
range_limit   float                           Defines the square region [-lim, lim] x [-lim, lim].    4.0
noise         float                           Small Gaussian noise added to points.                   0.01
device        Optional[Union[str, device]]    Device for the tensor.                                  None
dtype         dtype                           Data type for the tensor.                               float32
seed          Optional[int]                   Random seed for reproducibility.                        None
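
A minimal usage sketch. The constructor arguments are the documented parameters above; the len() and DataLoader calls assume that BaseSyntheticDataset implements the standard torch.utils.data.Dataset protocol, which this page does not show.

import torch
from torch.utils.data import DataLoader
from torchebm.datasets.generators import CheckerboardDataset

dataset = CheckerboardDataset(
    n_samples=2000,
    range_limit=4.0,
    noise=0.01,
    device="cpu",
    dtype=torch.float32,
    seed=42,
)

# Assumed: the base class exposes the points via the Dataset protocol.
print(len(dataset))         # expected: 2000
loader = DataLoader(dataset, batch_size=128, shuffle=True)
batch = next(iter(loader))  # expected shape: (128, 2)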
Source code in torchebm/datasets/generators.py
class CheckerboardDataset(BaseSyntheticDataset):
    """
    Generates points in a 2D checkerboard pattern using rejection sampling.

    Args:
        n_samples (int): Target number of samples. Default: 2000.
        range_limit (float): Defines the square region [-lim, lim] x [-lim, lim]. Default: 4.0.
        noise (float): Small Gaussian noise added to points. Default: 0.01.
        device (Optional[Union[str, torch.device]]): Device for the tensor.
        dtype (torch.dtype): Data type for the tensor. Default: torch.float32.
        seed (Optional[int]): Random seed for reproducibility.
    """

    def __init__(
        self,
        n_samples: int = 2000,
        range_limit: float = 4.0,
        noise: float = 0.01,
        device: Optional[Union[str, torch.device]] = None,
        dtype: torch.dtype = torch.float32,
        seed: Optional[int] = None,
    ):
        self.range_limit = range_limit
        self.noise = noise
        super().__init__(n_samples=n_samples, device=device, dtype=dtype, seed=seed)

    def _generate_data(self) -> torch.Tensor:
        # Logic from make_checkerboard
        collected_samples = []
        target = self.n_samples
        # Acceptance rate is ~0.5, so oversample each batch (2.5x the target)
        batch_size = max(1000, int(target * 2.5))

        while len(collected_samples) < target:
            x = np.random.uniform(-self.range_limit, self.range_limit, size=batch_size)
            y = np.random.uniform(-self.range_limit, self.range_limit, size=batch_size)

            keep = (np.floor(x) + np.floor(y)) % 2 != 0
            valid_points = np.vstack((x[keep], y[keep])).T.astype(np.float32)

            needed = target - len(collected_samples)
            collected_samples.extend(valid_points[:needed])  # Add only needed points

        X = np.array(
            collected_samples[:target], dtype=np.float32
        )  # Ensure exact n_samples
        tensor_data = torch.from_numpy(X)
        tensor_data += torch.randn_like(tensor_data) * self.noise

        return tensor_data
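
The rejection step can be reproduced standalone. The sketch below mirrors the keep condition from the source above and confirms that roughly half of the uniform proposals are accepted, which is why the generator oversamples each batch by a factor of 2.5.

import numpy as np

range_limit = 4.0
rng = np.random.default_rng(0)
x = rng.uniform(-range_limit, range_limit, size=10_000)
y = rng.uniform(-range_limit, range_limit, size=10_000)

# Keep points whose unit cell satisfies floor(x) + floor(y) odd: the "dark" squares.
keep = (np.floor(x) + np.floor(y)) % 2 != 0
points = np.stack([x[keep], y[keep]], axis=1)

print(points.shape)  # roughly (5000, 2)
print(keep.mean())   # ~0.5 acceptance rate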

range_limit instance-attribute

range_limit = range_limit

Half-width of the sampling square; points are drawn from [-range_limit, range_limit] x [-range_limit, range_limit].

noise instance-attribute

noise = noise

Standard deviation of the Gaussian noise added to each point.

_generate_data

_generate_data() -> torch.Tensor

Generates the checkerboard points by rejection sampling and returns a tensor of shape (n_samples, 2) with Gaussian noise added; the full source appears in the class listing above.
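
To eyeball the pattern, the sketch below calls the private _generate_data method shown on this page directly; in normal use the base class presumably exposes the data through its public interface, which is not documented here. It assumes matplotlib is installed and that the generated tensor lives on CPU.

import matplotlib.pyplot as plt
from torchebm.datasets.generators import CheckerboardDataset

ds = CheckerboardDataset(n_samples=5000, range_limit=4.0, seed=0)
points = ds._generate_data().cpu().numpy()  # shape (5000, 2)

plt.scatter(points[:, 0], points[:, 1], s=2)
plt.gca().set_aspect("equal")
plt.title("CheckerboardDataset samples")
plt.show()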