Skip to content

Processing Module

gigaspatial.processing

DataStore

Bases: ABC

Abstract base class defining the interface for data store implementations. This class serves as a parent for both local and cloud-based storage solutions.

Source code in gigaspatial/core/io/data_store.py
class DataStore(ABC):
    """
    Contract that every storage backend has to fulfil.

    Concrete subclasses (local filesystem, cloud storage, ...) implement each
    abstract method below so that calling code can stay backend-agnostic.
    """

    @abstractmethod
    def read_file(self, path: str) -> Any:
        """
        Fetch the contents stored at ``path``.

        Args:
            path: Location of the file to read.

        Returns:
            The file contents, in whatever form the backend provides.

        Raises:
            IOError: When the file cannot be read.
        """

    @abstractmethod
    def write_file(self, path: str, data: Any) -> None:
        """
        Store ``data`` at ``path``.

        Args:
            path: Destination of the file.
            data: Payload to persist.

        Raises:
            IOError: When the file cannot be written.
        """

    @abstractmethod
    def file_exists(self, path: str) -> bool:
        """
        Report whether a file is present at ``path``.

        Args:
            path: Location to probe.

        Returns:
            True when the file exists, False otherwise.
        """

    @abstractmethod
    def list_files(self, path: str) -> List[str]:
        """
        Enumerate the files found in a directory.

        Args:
            path: Directory whose files are listed.

        Returns:
            Paths of the files inside the directory.
        """

    @abstractmethod
    def walk(self, top: str) -> Generator:
        """
        Traverse a directory tree in the manner of ``os.walk()``.

        Args:
            top: Directory where the traversal starts.

        Returns:
            Generator producing ``(dirpath, dirnames, filenames)`` tuples.
        """

    # NOTE(review): the annotation says Union[str, bytes] while the docstring
    # describes a file-like object; confirm which one implementers rely on.
    @abstractmethod
    def open(self, file: str, mode: str = "r") -> Union[str, bytes]:
        """
        Open ``file`` for use as a context manager.

        Args:
            file: Location of the file.
            mode: One of 'r', 'w', 'rb', 'wb'.

        Yields:
            A file-like object.

        Raises:
            IOError: When the file cannot be opened.
        """

    @abstractmethod
    def is_file(self, path: str) -> bool:
        """
        Report whether ``path`` refers to a file.

        Args:
            path: Location to probe.

        Returns:
            True when the path is a file, False otherwise.
        """

    @abstractmethod
    def is_dir(self, path: str) -> bool:
        """
        Report whether ``path`` refers to a directory.

        Args:
            path: Location to probe.

        Returns:
            True when the path is a directory, False otherwise.
        """

    @abstractmethod
    def remove(self, path: str) -> None:
        """
        Delete a single file.

        Args:
            path: Location of the file to delete.

        Raises:
            IOError: When the file cannot be removed.
        """

    @abstractmethod
    def rmdir(self, dir: str) -> None:
        """
        Delete a directory together with everything inside it.

        Args:
            dir: Location of the directory to delete.

        Raises:
            IOError: When the directory cannot be removed.
        """

file_exists(path) abstractmethod

Check if a file exists in the data store.

Parameters:

Name Type Description Default
path str

Path to check

required

Returns:

Type Description
bool

True if file exists, False otherwise

Source code in gigaspatial/core/io/data_store.py
@abstractmethod
def file_exists(self, path: str) -> bool:
    """
    Report whether a file is present at ``path``.

    Args:
        path: Location to probe.

    Returns:
        True when the file exists, False otherwise.
    """

is_dir(path) abstractmethod

Check if path points to a directory.

Parameters:

Name Type Description Default
path str

Path to check

required

Returns:

Type Description
bool

True if path is a directory, False otherwise

Source code in gigaspatial/core/io/data_store.py
@abstractmethod
def is_dir(self, path: str) -> bool:
    """
    Report whether ``path`` refers to a directory.

    Args:
        path: Location to probe.

    Returns:
        True when the path is a directory, False otherwise.
    """

is_file(path) abstractmethod

Check if path points to a file.

Parameters:

Name Type Description Default
path str

Path to check

required

Returns:

Type Description
bool

True if path is a file, False otherwise

Source code in gigaspatial/core/io/data_store.py
@abstractmethod
def is_file(self, path: str) -> bool:
    """
    Report whether ``path`` refers to a file.

    Args:
        path: Location to probe.

    Returns:
        True when the path is a file, False otherwise.
    """

list_files(path) abstractmethod

List all files in a directory.

Parameters:

Name Type Description Default
path str

Directory path to list

required

Returns:

Type Description
List[str]

List of file paths in the directory

Source code in gigaspatial/core/io/data_store.py
@abstractmethod
def list_files(self, path: str) -> List[str]:
    """
    Enumerate the files found in a directory.

    Args:
        path: Directory whose files are listed.

    Returns:
        Paths of the files inside the directory.
    """

open(file, mode='r') abstractmethod

Context manager for file operations.

Parameters:

Name Type Description Default
file str

Path to the file

required
mode str

File mode ('r', 'w', 'rb', 'wb')

'r'

Yields:

Type Description
Union[str, bytes]

File-like object

Raises:

Type Description
IOError

If file cannot be opened

Source code in gigaspatial/core/io/data_store.py
# NOTE(review): the annotation says Union[str, bytes] while the docstring
# describes a file-like object; confirm which one implementers rely on.
@abstractmethod
def open(self, file: str, mode: str = "r") -> Union[str, bytes]:
    """
    Open ``file`` for use as a context manager.

    Args:
        file: Location of the file.
        mode: One of 'r', 'w', 'rb', 'wb'.

    Yields:
        A file-like object.

    Raises:
        IOError: When the file cannot be opened.
    """

read_file(path) abstractmethod

Read contents of a file from the data store.

Parameters:

Name Type Description Default
path str

Path to the file to read

required

Returns:

Type Description
Any

Contents of the file

Raises:

Type Description
IOError

If file cannot be read

Source code in gigaspatial/core/io/data_store.py
@abstractmethod
def read_file(self, path: str) -> Any:
    """
    Fetch the contents stored at ``path``.

    Args:
        path: Location of the file to read.

    Returns:
        The file contents, in whatever form the backend provides.

    Raises:
        IOError: When the file cannot be read.
    """

remove(path) abstractmethod

Remove a file.

Parameters:

Name Type Description Default
path str

Path to the file to remove

required

Raises:

Type Description
IOError

If file cannot be removed

Source code in gigaspatial/core/io/data_store.py
@abstractmethod
def remove(self, path: str) -> None:
    """
    Delete a single file.

    Args:
        path: Location of the file to delete.

    Raises:
        IOError: When the file cannot be removed.
    """

rmdir(dir) abstractmethod

Remove a directory and all its contents.

Parameters:

Name Type Description Default
dir str

Path to the directory to remove

required

Raises:

Type Description
IOError

If directory cannot be removed

Source code in gigaspatial/core/io/data_store.py
@abstractmethod
def rmdir(self, dir: str) -> None:
    """
    Delete a directory together with everything inside it.

    Args:
        dir: Location of the directory to delete.

    Raises:
        IOError: When the directory cannot be removed.
    """

walk(top) abstractmethod

Walk through directory tree, similar to os.walk().

Parameters:

Name Type Description Default
top str

Starting directory for the walk

required

Returns:

Type Description
Generator

Generator yielding tuples of (dirpath, dirnames, filenames)

Source code in gigaspatial/core/io/data_store.py
@abstractmethod
def walk(self, top: str) -> Generator:
    """
    Traverse a directory tree in the manner of ``os.walk()``.

    Args:
        top: Directory where the traversal starts.

    Returns:
        Generator producing ``(dirpath, dirnames, filenames)`` tuples.
    """

write_file(path, data) abstractmethod

Write data to a file in the data store.

Parameters:

Name Type Description Default
path str

Path where to write the file

required
data Any

Data to write to the file

required

Raises:

Type Description
IOError

If file cannot be written

Source code in gigaspatial/core/io/data_store.py
@abstractmethod
def write_file(self, path: str, data: Any) -> None:
    """
    Store ``data`` at ``path``.

    Args:
        path: Destination of the file.
        data: Payload to persist.

    Raises:
        IOError: When the file cannot be written.
    """

LocalDataStore

Bases: DataStore

Implementation for local filesystem storage.

Source code in gigaspatial/core/io/local_data_store.py
class LocalDataStore(DataStore):
    """
    DataStore implementation backed by the local filesystem.

    Relative paths are interpreted against ``base_path``; absolute paths are
    used as given. Listing helpers report paths relative to ``base_path`` when
    the entry lives under it, and absolute paths otherwise, so that every
    returned path can be passed straight back into the store.
    """

    def __init__(self, base_path: Union[str, Path] = ""):
        """
        Initialize the local data store.

        Args:
            base_path: Base directory for relative paths. The default ``""``
                resolves to the current working directory.
        """
        super().__init__()
        self.base_path = Path(base_path).resolve()

    def _resolve_path(self, path: Pathish) -> Path:
        """
        Turn a store path into an absolute, resolved filesystem path.

        Args:
            path: Absolute path, or path relative to ``base_path``.

        Returns:
            The resolved absolute path.
        """
        path_obj = Path(path)

        # If absolute, return as-is
        if path_obj.is_absolute():
            return path_obj.resolve()

        # Otherwise, resolve relative to base_path
        return (self.base_path / path_obj).resolve()

    def _to_store_path(self, full_path: Path) -> str:
        """Express ``full_path`` relative to ``base_path``, or absolute if outside it."""
        try:
            return str(full_path.relative_to(self.base_path))
        except ValueError:
            # The entry is not under base_path (an absolute path was supplied);
            # the absolute form still round-trips through _resolve_path.
            return str(full_path)

    def read_file(self, path: str) -> bytes:
        """
        Read contents of a file as bytes.

        Args:
            path: Path to the file.

        Returns:
            File contents in bytes.
        """
        full_path = self._resolve_path(path)
        with open(full_path, "rb") as f:
            return f.read()

    def write_file(self, path: str, data: Union[bytes, str]) -> None:
        """
        Write data (string or bytes) to a file.
        Automatically creates parent directories if they don't exist.

        Args:
            path: Path where to write.
            data: Data to write. ``str`` is written as UTF-8 text, anything
                else is written in binary mode.
        """
        full_path = self._resolve_path(path)
        self.mkdir(str(full_path.parent), exist_ok=True)

        if isinstance(data, str):
            mode = "w"
            encoding = "utf-8"
        else:
            mode = "wb"
            encoding = None

        with open(full_path, mode, encoding=encoding) as f:
            f.write(data)

    def file_exists(self, path: str) -> bool:
        """Checks if file exists at path."""
        return self._resolve_path(path).is_file()

    def list_files(self, path: str) -> List[str]:
        """
        List the files directly inside a directory.

        Args:
            path: Directory to list.

        Returns:
            File paths relative to ``base_path``; absolute paths when the
            directory lies outside ``base_path``.
        """
        full_path = self._resolve_path(path)
        return [self._to_store_path(f) for f in full_path.iterdir() if f.is_file()]

    def walk(self, top: str) -> Generator[Tuple[str, List[str], List[str]], None, None]:
        """
        Walk through directory tree.

        Args:
            top: Starting directory.

        Yields:
            Tuple of (root, dirnames, filenames), where root is relative to
            ``base_path`` when possible and absolute otherwise.
        """
        full_path = self._resolve_path(top)
        for root, dirs, files in os.walk(full_path):
            yield self._to_store_path(Path(root)), dirs, files

    def list_directories(self, path: str) -> List[str]:
        """
        List immediate subdirectories in path.

        Args:
            path: Directory to list.

        Returns:
            List of subdirectory names; empty when ``path`` is missing or is
            not a directory.
        """
        full_path = self._resolve_path(path)

        # is_dir() is already False for a missing path, so one check suffices.
        if not full_path.is_dir():
            return []

        return [d.name for d in full_path.iterdir() if d.is_dir()]

    def open(self, path: str, mode: str = "r") -> IO:
        """
        Open a file on the local filesystem.

        Parent directories are created only for modes that can create the
        file ('w', 'a', 'x'), so a failed read leaves nothing behind.

        Args:
            path: File path.
            mode: Open mode.

        Returns:
            Open file object; the caller is responsible for closing it.
        """
        full_path = self._resolve_path(path)
        if set(mode) & set("wax"):
            self.mkdir(str(full_path.parent), exist_ok=True)
        return full_path.open(mode)

    def is_file(self, path: str) -> bool:
        """Checks if path is a file."""
        return self._resolve_path(path).is_file()

    def is_dir(self, path: str) -> bool:
        """Checks if path is a directory."""
        return self._resolve_path(path).is_dir()

    def remove(self, path: str) -> None:
        """Removes a file; does nothing when path is not an existing file."""
        full_path = self._resolve_path(path)
        if full_path.is_file():
            os.remove(full_path)

    def copy_file(self, src: str, dst: str) -> None:
        """Copy a file from src to dst, creating dst's parent directories."""
        src_path = self._resolve_path(src)
        dst_path = self._resolve_path(dst)
        self.mkdir(str(dst_path.parent), exist_ok=True)
        shutil.copy2(src_path, dst_path)

    def rmdir(self, directory: str) -> None:
        """
        Remove a directory and all its contents.

        Does nothing when ``directory`` is not an existing directory.

        Args:
            directory: Path to the directory to remove.
        """
        full_path = self._resolve_path(directory)
        if full_path.is_dir():
            # The DataStore contract is recursive removal; os.rmdir only
            # handles empty directories.
            shutil.rmtree(full_path)

    def mkdir(self, path: str, exist_ok: bool = False) -> None:
        """Creates a directory and its parents."""
        full_path = self._resolve_path(path)
        full_path.mkdir(parents=True, exist_ok=exist_ok)

    def exists(self, path: str) -> bool:
        """Checks if path exists."""
        return self._resolve_path(path).exists()

__init__(base_path='')

Initialize the local data store.

Parameters:

Name Type Description Default
base_path Union[str, Path]

Base directory for relative paths. Defaults to current directory.

''
Source code in gigaspatial/core/io/local_data_store.py
def __init__(self, base_path: Union[str, Path] = ""):
    """
    Set up the store and fix the directory that relative paths refer to.

    Args:
        base_path: Root directory for relative paths. It is resolved to an
            absolute path; the default ``""`` resolves to the current
            working directory.
    """
    super().__init__()
    root = Path(base_path)
    self.base_path = root.resolve()

copy_file(src, dst)

Copy a file from src to dst.

Source code in gigaspatial/core/io/local_data_store.py
def copy_file(self, src: str, dst: str) -> None:
    """Duplicate the file at src to dst, creating dst's parent directories first."""
    source = self._resolve_path(src)
    target = self._resolve_path(dst)
    target_dir = str(target.parent)
    self.mkdir(target_dir, exist_ok=True)
    shutil.copy2(source, target)

exists(path)

Checks if path exists.

Source code in gigaspatial/core/io/local_data_store.py
def exists(self, path: str) -> bool:
    """Tell whether anything (file or directory) is present at path."""
    target = self._resolve_path(path)
    return target.exists()

file_exists(path)

Checks if file exists at path.

Source code in gigaspatial/core/io/local_data_store.py
def file_exists(self, path: str) -> bool:
    """Tell whether path resolves to an existing regular file."""
    target = self._resolve_path(path)
    return target.is_file()

is_dir(path)

Checks if path is a directory.

Source code in gigaspatial/core/io/local_data_store.py
def is_dir(self, path: str) -> bool:
    """Tell whether path resolves to an existing directory."""
    target = self._resolve_path(path)
    return target.is_dir()

is_file(path)

Checks if path is a file.

Source code in gigaspatial/core/io/local_data_store.py
def is_file(self, path: str) -> bool:
    """Tell whether path resolves to an existing regular file."""
    target = self._resolve_path(path)
    return target.is_file()

list_directories(path)

List immediate subdirectories in path.

Parameters:

Name Type Description Default
path str

Directory to list.

required

Returns:

Type Description
List[str]

List of subdirectory names.

Source code in gigaspatial/core/io/local_data_store.py
def list_directories(self, path: str) -> List[str]:
    """
    Collect the names of the directories directly beneath path.

    Args:
        path: Directory to inspect.

    Returns:
        Names of the immediate subdirectories; an empty list when path is
        missing or is not a directory.
    """
    parent = self._resolve_path(path)

    # is_dir() is False for a missing path too, so one guard covers both cases.
    if not parent.is_dir():
        return []

    names = []
    for child in parent.iterdir():
        if child.is_dir():
            names.append(child.name)
    return names

list_files(path)

List all files in a directory, returning relative paths from base_path.

Parameters:

Name Type Description Default
path str

Directory to list.

required

Returns:

Type Description
List[str]

List of relative file paths.

Source code in gigaspatial/core/io/local_data_store.py
def list_files(self, path: str) -> List[str]:
    """
    List the files directly inside a directory.

    Args:
        path: Directory to list.

    Returns:
        File paths relative to ``base_path``. When the directory lies outside
        ``base_path`` (an absolute path was given), absolute paths are
        returned instead of raising ``ValueError``.
    """
    full_path = self._resolve_path(path)

    # Every entry shares the same parent, so checking the directory once is
    # enough to know whether relative_to() can succeed for its children.
    try:
        full_path.relative_to(self.base_path)
        inside_base = True
    except ValueError:
        inside_base = False

    return [
        str(f.relative_to(self.base_path)) if inside_base else str(f)
        for f in full_path.iterdir()
        if f.is_file()
    ]

mkdir(path, exist_ok=False)

Creates a directory and its parents.

Source code in gigaspatial/core/io/local_data_store.py
def mkdir(self, path: str, exist_ok: bool = False) -> None:
    """Create the directory at path, including any missing parent directories."""
    self._resolve_path(path).mkdir(parents=True, exist_ok=exist_ok)

open(path, mode='r')

Open a file-like object using the local filesystem.

Parameters:

Name Type Description Default
path str

File path.

required
mode str

Open mode.

'r'

Returns:

Type Description
IO

Open file object (a Python file handle, not an integer OS file descriptor).

Source code in gigaspatial/core/io/local_data_store.py
def open(self, path: str, mode: str = "r") -> IO:
    """
    Open a file on the local filesystem.

    Parent directories are created only for modes that can create the file
    ('w', 'a', 'x'), so a failed read does not leave empty directories behind.

    Args:
        path: File path.
        mode: Open mode.

    Returns:
        Open file object; the caller is responsible for closing it.
    """
    full_path = self._resolve_path(path)
    if set(mode) & set("wax"):
        self.mkdir(str(full_path.parent), exist_ok=True)
    return full_path.open(mode)

read_file(path)

Read contents of a file as bytes.

Parameters:

Name Type Description Default
path str

Path to the file.

required

Returns:

Type Description
bytes

File contents in bytes.

Source code in gigaspatial/core/io/local_data_store.py
def read_file(self, path: str) -> bytes:
    """
    Load the raw bytes of a file.

    Args:
        path: Location of the file.

    Returns:
        The complete file contents as bytes.
    """
    target = self._resolve_path(path)
    return target.read_bytes()

remove(path)

Removes a file.

Source code in gigaspatial/core/io/local_data_store.py
def remove(self, path: str) -> None:
    """Delete the file at path; do nothing when path is not an existing file."""
    target = self._resolve_path(path)
    if not target.is_file():
        return
    os.remove(target)

rmdir(directory)

Removes a directory.

Source code in gigaspatial/core/io/local_data_store.py
def rmdir(self, directory: str) -> None:
    """
    Remove a directory and all its contents.

    Does nothing when ``directory`` is not an existing directory.

    Args:
        directory: Path to the directory to remove.
    """
    full_path = self._resolve_path(directory)
    if full_path.is_dir():
        # The DataStore contract is recursive removal; os.rmdir would raise
        # OSError for any non-empty directory.
        shutil.rmtree(full_path)

walk(top)

Walk through directory tree.

Parameters:

Name Type Description Default
top str

Starting directory.

required

Yields:

Type Description
Tuple[str, List[str], List[str]]

Tuple of (relative_root, dirnames, filenames).

Source code in gigaspatial/core/io/local_data_store.py
def walk(self, top: str) -> Generator[Tuple[str, List[str], List[str]], None, None]:
    """
    Walk through directory tree.

    Args:
        top: Starting directory.

    Yields:
        Tuple of (root, dirnames, filenames). ``root`` is relative to
        ``base_path`` when the directory lies under it; otherwise the
        absolute path is yielded instead of raising ``ValueError``.
    """
    full_path = self._resolve_path(top)
    for root, dirs, files in os.walk(full_path):
        root_path = Path(root)
        try:
            rel_root = str(root_path.relative_to(self.base_path))
        except ValueError:
            # top was an absolute path outside base_path.
            rel_root = str(root_path)
        yield rel_root, dirs, files

write_file(path, data)

Write data (string or bytes) to a file. Automatically creates parent directories if they don't exist.

Parameters:

Name Type Description Default
path str

Path where to write.

required
data Union[bytes, str]

Data to write (str or bytes).

required
Source code in gigaspatial/core/io/local_data_store.py
def write_file(self, path: str, data: Union[bytes, str]) -> None:
    """
    Persist text or binary data at path.

    Missing parent directories are created before writing.

    Args:
        path: Destination of the file.
        data: Payload; ``str`` is written as UTF-8 text, anything else is
            written in binary mode.
    """
    target = self._resolve_path(path)
    self.mkdir(str(target.parent), exist_ok=True)

    if isinstance(data, str):
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(data)
        return

    with open(target, "wb") as handle:
        handle.write(data)

TifProcessor

Handler for TIF data processing and analysis.

Supports advanced operations like merging multiple rasters, reprojection, clipping to geometries, and converting raster data to formats like DataFrames or Graphs.

Attributes:

Name Type Description
dataset_path Union[Path, str, List[Union[Path, str]]]

Path(s) to the TIF file(s).

data_store Optional[DataStore]

DataStore instance for file access.

mode Literal['single', 'rgb', 'rgba', 'multi']

Processing mode ('single', 'rgb', 'rgba', 'multi').

merge_method Literal['first', 'last', 'min', 'max', 'mean']

Method for merging multiple rasters.

target_crs Optional[str]

Optional CRS to reproject to.

resampling_method Resampling

Resampling algorithm to use.

reprojection_resolution Optional[Tuple[float, float]]

Target pixel size for reprojection.

Source code in gigaspatial/processing/tif_processor.py
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class TifProcessor:
    """
    Handler for TIF data processing and analysis.

    Supports advanced operations like merging multiple rasters, reprojection,
    clipping to geometries, and converting raster data to formats like
    DataFrames or Graphs.

    Attributes:
        dataset_path: Path(s) to the TIF file(s).
        data_store: DataStore instance for file access.
        mode: Processing mode ('single', 'rgb', 'rgba', 'multi').
        merge_method: Method for merging multiple rasters.
        target_crs: Optional CRS to reproject to.
        resampling_method: Resampling algorithm to use.
        reprojection_resolution: Target pixel size for reprojection.
    """

    dataset_path: Union[Path, str, List[Union[Path, str]]]
    data_store: Optional[DataStore] = None
    mode: Literal["single", "rgb", "rgba", "multi"] = "single"
    merge_method: Literal["first", "last", "min", "max", "mean"] = "first"
    target_crs: Optional[str] = None  # For reprojection if needed
    resampling_method: Resampling = Resampling.nearest
    reprojection_resolution: Optional[Tuple[float, float]] = None

    def __post_init__(self):
        """
        Finish initialisation once the dataclass fields have been assigned.

        Sets the data store fallback, logger, metadata cache and a private
        temporary directory, then prepares the working raster:

        * a list of paths is validated and merged into one temporary file;
        * a single path is checked for existence and, when ``target_crs`` is
          set, reprojected into a temporary file.

        Finally the raster metadata is cached and the mode/band compatibility
        check is run.

        Raises:
            FileNotFoundError: If a single dataset path cannot be found.
        """
        # Local import: only needed to tidy up when initialisation fails.
        import shutil

        self.data_store = self.data_store or LocalDataStore()
        self.logger = config.get_logger(self.__class__.__name__)
        self._cache = {}
        self._temp_dir = tempfile.mkdtemp()
        self._merged_file_path = None
        self._reprojected_file_path = None
        self._clipped_file_path = None

        try:
            # Handle multiple dataset paths
            if isinstance(self.dataset_path, list):
                self.dataset_paths = [Path(p) for p in self.dataset_path]
                self._validate_multiple_datasets()
                self._merge_rasters()
                # From here on the processor works on the merged temp file.
                self.dataset_path = self._merged_file_path
            else:
                self.dataset_paths = [Path(self.dataset_path)]
                # For absolute paths with LocalDataStore, check file existence
                # directly to avoid path resolution issues
                if isinstance(self.data_store, LocalDataStore) and os.path.isabs(
                    str(self.dataset_path)
                ):
                    if not os.path.exists(str(self.dataset_path)):
                        raise FileNotFoundError(
                            f"Dataset not found at {self.dataset_path}"
                        )
                elif not self.data_store.file_exists(str(self.dataset_path)):
                    raise FileNotFoundError(
                        f"Dataset not found at {self.dataset_path}"
                    )

                # Reproject single raster during initialization if target_crs is set
                if self.target_crs:
                    self.logger.info(
                        f"Reprojecting single raster to {self.target_crs}..."
                    )
                    with self.data_store.open(str(self.dataset_path), "rb") as f:
                        with rasterio.MemoryFile(f.read()) as memfile:
                            with memfile.open() as src:
                                self._reprojected_file_path = (
                                    self._reproject_to_temp_file(src, self.target_crs)
                                )
                    self.dataset_path = self._reprojected_file_path

            self._load_metadata()
            self._validate_mode_band_compatibility()
        except Exception:
            # mkdtemp() leaves deletion to the caller; a half-initialised
            # processor is never handed back, so drop its temp directory
            # here instead of leaving it behind on disk.
            shutil.rmtree(self._temp_dir, ignore_errors=True)
            raise

    @field_validator("dataset_path")
    @classmethod
    def validate_dataset_path(cls, value):
        """
        Normalise and validate the ``dataset_path`` field.

        Args:
            value: A single path (``str``/``Path``) or a list of paths.

        Returns:
            The path itself for a single path, the sole element of a one-item
            list, or the list unchanged when it holds several paths.

        Raises:
            ValueError: If the list is empty or the value is neither a path
                nor a list.
        """
        if isinstance(value, list):
            if not value:
                raise ValueError("No dataset paths provided.")
            # A one-element list is handled like a single path (no merging).
            return value[0] if len(value) == 1 else value

        if isinstance(value, (Path, str)):
            return value

        # Fail loudly instead of falling through and returning None, which
        # would silently replace the field value.
        raise ValueError(
            "dataset_path must be a path or a list of paths, "
            f"got {type(value).__name__}."
        )

    @contextmanager
    def open_dataset(self):
        """
        Context manager that opens the raster this processor works on.

        A temporary file produced by this processor takes precedence, in the
        order merged, reprojected, clipped. Otherwise the original dataset is
        opened directly from its path for a ``LocalDataStore``, or read
        through the data store into an in-memory file for any other store.

        Yields:
            A rasterio.DatasetReader object.
        """
        # First truthy temp path wins, mirroring merged > reprojected > clipped.
        temp_path = (
            self._merged_file_path
            or self._reprojected_file_path
            or self._clipped_file_path
        )

        if temp_path:
            local_source = temp_path
        elif isinstance(self.data_store, LocalDataStore):
            local_source = str(self.dataset_path)
        else:
            local_source = None

        if local_source is not None:
            with rasterio.open(local_source) as src:
                yield src
            return

        # Non-local store: pull the bytes and expose them via a MemoryFile.
        with self.data_store.open(str(self.dataset_path), "rb") as fh:
            with rasterio.MemoryFile(fh.read()) as memfile, memfile.open() as src:
                yield src

    def reproject_to(
        self,
        target_crs: str,
        output_path: Optional[Union[str, Path]] = None,
        resampling_method: Optional[Resampling] = None,
        resolution: Optional[Tuple[float, float]] = None,
    ):
        """
        Reproject the current raster to a new CRS.

        Args:
            target_crs: The destination CRS (e.g., "EPSG:4326").
            output_path: Optional path to save the result. If None, the output
                is written inside this processor's temporary directory.
            resampling_method: Optional override for the resampling algorithm;
                defaults to ``self.resampling_method`` when None.
            resolution: Optional target pixel resolution (x, y); defaults to
                ``self.reprojection_resolution``.

        Returns:
            Path to the reprojected file. When the raster is already in the
            target CRS, ``self.dataset_path`` is returned unchanged (after
            copying it to ``output_path`` if one was given).
        """
        self.logger.info(f"Reprojecting raster to {target_crs}...")

        # Resampling is an IntEnum whose `nearest` member is 0 and therefore
        # falsy, so an `or` fallback would discard an explicit
        # Resampling.nearest. Test for None instead.
        if resampling_method is None:
            resampling_method = self.resampling_method
        resolution = resolution or self.reprojection_resolution

        with self.open_dataset() as src:
            # Compare CRS objects rather than their string form so equivalent
            # spellings (e.g. "epsg:4326") are recognised; this matches the
            # comparison used when merging rasters.
            if src.crs == target_crs:
                self.logger.info(
                    "Raster is already in the target CRS. No reprojection needed."
                )
                # If output_path is specified, copy the file
                if output_path:
                    self.data_store.copy_file(str(self.dataset_path), output_path)
                return self.dataset_path

            dst_path = output_path or os.path.join(
                self._temp_dir, f"reprojected_single_{os.urandom(8).hex()}.tif"
            )
            profile = self._get_reprojection_profile(src, target_crs, resolution)
            # Loop-invariant: look up the CPU count once, not per band.
            num_threads = multiprocessing.cpu_count()

            with rasterio.open(dst_path, "w", **profile) as dst:
                for band_idx in range(1, src.count + 1):
                    reproject(
                        source=rasterio.band(src, band_idx),
                        destination=rasterio.band(dst, band_idx),
                        src_transform=src.transform,
                        src_crs=src.crs,
                        dst_transform=dst.transform,
                        dst_crs=dst.crs,
                        resampling=resampling_method,
                        num_threads=num_threads,
                    )

            self.logger.info(f"Reprojection complete. Output saved to {dst_path}")
            return Path(dst_path)

    def get_raster_info(
        self,
        include_statistics: bool = False,
        approx_ok: bool = False,
    ) -> Dict[str, Any]:
        """
        Get comprehensive metadata and statistics for the raster.

        Args:
            include_statistics: Whether to compute pixel statistics (mean, std, etc.).
            approx_ok: Whether to allow approximate statistics for speed.

        Returns:
            Dictionary containing metadata like dimensions, CRS, bounds, and optionally statistics.
        """
        info = {
            "count": self.count,
            "width": self.width,
            "height": self.height,
            "crs": self.crs,
            "bounds": self.bounds,
            "transform": self.transform,
            "dtypes": self.dtype,
            "nodata": self.nodata,
            "mode": self.mode,
            "is_merged": self.is_merged,
            "source_count": self.source_count,
        }

        if include_statistics:
            info["statistics"] = self._get_basic_statistics(approx_ok=approx_ok)

        return info

    def _reproject_to_temp_file(
        self, src: rasterio.DatasetReader, target_crs: str
    ) -> str:
        """
        Reproject ``src`` band by band into a new file in the temp directory.

        Uses the processor's ``reprojection_resolution`` and
        ``resampling_method``.

        Args:
            src: Open raster to reproject.
            target_crs: Destination CRS.

        Returns:
            Path of the temporary file that was written.
        """
        file_name = f"reprojected_temp_{os.urandom(8).hex()}.tif"
        dst_path = os.path.join(self._temp_dir, file_name)

        dst_profile = self._get_reprojection_profile(
            src, target_crs, self.reprojection_resolution
        )

        with rasterio.open(dst_path, "w", **dst_profile) as dst:
            for band in range(1, src.count + 1):
                band_in = rasterio.band(src, band)
                band_out = rasterio.band(dst, band)
                reproject(
                    source=band_in,
                    destination=band_out,
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst.transform,
                    dst_crs=dst.crs,
                    resampling=self.resampling_method,
                )

        return dst_path

    def _validate_multiple_datasets(self):
        """
        Check that the rasters queued for merging are mutually compatible.

        The first dataset is the reference. Band count and the dtype of the
        first band must match it exactly. Differences in CRS or pixel size
        (only checked when no ``target_crs`` is set) and in nodata value
        merely produce log warnings.

        Raises:
            ValueError: If fewer than two paths are given, or a dataset's band
                count or dtype differs from the reference.
        """
        if len(self.dataset_paths) < 2:
            raise ValueError("Multiple dataset paths required for merging")

        reference_path, *remaining_paths = self.dataset_paths

        # Capture the reference properties from the first raster.
        with self.data_store.open(str(reference_path), "rb") as fh:
            with rasterio.MemoryFile(fh.read()) as memfile:
                with memfile.open() as ref_src:
                    ref_count, ref_dtype = ref_src.count, ref_src.dtypes[0]
                    ref_crs, ref_transform = ref_src.crs, ref_src.transform
                    ref_nodata = ref_src.nodata

        # Indices start at 1 because dataset 0 is the reference itself.
        for i, candidate in enumerate(remaining_paths, start=1):
            with self.data_store.open(str(candidate), "rb") as fh:
                with rasterio.MemoryFile(fh.read()) as memfile:
                    with memfile.open() as src:
                        # Hard failures: band count and dtype must agree.
                        if src.count != ref_count:
                            raise ValueError(
                                f"Dataset {i} has {src.count} bands, expected {ref_count}"
                            )

                        first_dtype = src.dtypes[0]
                        if first_dtype != ref_dtype:
                            raise ValueError(
                                f"Dataset {i} has dtype {first_dtype}, expected {ref_dtype}"
                            )

                        # Soft checks: only logged, never fatal.
                        if not self.target_crs and src.crs != ref_crs:
                            self.logger.warning(
                                f"Dataset {i} has CRS {src.crs}, expected {ref_crs}. "
                                "Consider setting target_crs parameter for reprojection before merging."
                            )

                        if self.target_crs is None and not self._transforms_compatible(
                            src.transform, ref_transform
                        ):
                            self.logger.warning(
                                f"Dataset {i} has different resolution. Resampling may be needed."
                            )

                        if src.nodata != ref_nodata:
                            self.logger.warning(
                                f"Dataset {i} has different nodata value: {src.nodata} vs {ref_nodata}"
                            )

    def _get_reprojection_profile(
        self,
        src: rasterio.DatasetReader,
        target_crs: str,
        resolution: Optional[Tuple[float, float]],
        compression: str = "lzw",
    ):
        """
        Build the rasterio profile for writing ``src`` reprojected to ``target_crs``.

        Args:
            src: Open source raster.
            target_crs: Destination CRS.
            resolution: Target pixel size; when falsy, the default computed by
                ``calculate_default_transform`` is used.
            compression: Value stored under the profile's "compress" key.

        Returns:
            A copy of ``src.profile`` with CRS, transform, width, height and
            compression replaced.
        """
        extra_kwargs = {}
        if resolution:
            src_res = (abs(src.transform.a), abs(src.transform.e))
            self.logger.info(
                f"Using target resolution: {resolution}. Source resolution: {src_res}."
            )
            # Only pass an explicit resolution when one was requested.
            extra_kwargs["resolution"] = resolution

        dst_transform, width, height = calculate_default_transform(
            src.crs,
            target_crs,
            src.width,
            src.height,
            *src.bounds,
            **extra_kwargs,
        )

        overrides = dict(
            crs=target_crs,
            transform=dst_transform,
            width=width,
            height=height,
            compress=compression,  # compress the output to save space
        )
        profile = src.profile.copy()
        profile.update(overrides)
        return profile

    def _transforms_compatible(self, transform1, transform2, tolerance=1e-6):
        """Check if two transforms have compatible pixel sizes."""
        return (
            abs(transform1.a - transform2.a) < tolerance
            and abs(transform1.e - transform2.e) < tolerance
        )

    def _merge_rasters(self):
        """
        Merge all rasters in ``self.dataset_paths`` into one temporary file.

        Each source is first staged as a file in the temp directory:
        reprojected when ``target_crs`` is set and differs from the source
        CRS, otherwise copied as is. The staged files are merged with
        ``_merge_with_mean`` for the "mean" method or rasterio's ``merge``
        otherwise, and the result is written to ``self._merged_file_path``.
        Staged files are closed and deleted afterwards, even on failure.
        """
        self.logger.info(f"Merging {len(self.dataset_paths)} rasters...")

        opened_sources = []
        staged_files = []
        try:
            for path in self.dataset_paths:
                with self.data_store.open(str(path), "rb") as fh:
                    with rasterio.MemoryFile(fh.read()) as memfile:
                        with memfile.open() as src:
                            if self.target_crs and src.crs != self.target_crs:
                                self.logger.info(
                                    f"Reprojecting {path.name} to {self.target_crs} before merging."
                                )
                                staged = self._reproject_to_temp_file(
                                    src, self.target_crs
                                )
                                staged_files.append(staged)
                            else:
                                # Copy to disk so the data outlives the MemoryFile.
                                staged = os.path.join(
                                    self._temp_dir,
                                    f"temp_{path.stem}_{os.urandom(4).hex()}.tif",
                                )
                                staged_files.append(staged)
                                with rasterio.open(staged, "w", **src.profile) as dst:
                                    dst.write(src.read())

                            opened_sources.append(rasterio.open(staged))

            self._merged_file_path = os.path.join(self._temp_dir, "merged_raster.tif")

            if self.merge_method == "mean":
                merged_array, merged_transform = self._merge_with_mean(
                    opened_sources
                )
            else:
                merged_array, merged_transform = merge(
                    opened_sources,
                    method=self.merge_method,
                    resampling=self.resampling_method,
                )

            # The first staged raster supplies the base profile.
            reference = opened_sources[0]
            out_profile = reference.profile.copy()
            out_profile.update(
                {
                    "height": merged_array.shape[-2],
                    "width": merged_array.shape[-1],
                    "transform": merged_transform,
                    "crs": self.target_crs or reference.crs,
                }
            )

            with rasterio.open(self._merged_file_path, "w", **out_profile) as dst:
                dst.write(merged_array)
        finally:
            for handle in opened_sources:
                if hasattr(handle, "close"):
                    handle.close()

            # Best-effort removal of the staged files.
            for leftover in staged_files:
                try:
                    os.remove(leftover)
                except OSError:
                    pass

        self.logger.info("Raster merging completed!")

    def _merge_with_mean(self, src_files):
        """
        Merge rasters by averaging the valid pixels that overlap.

        The output grid spans the union of all source bounds at the pixel
        size of the first source. Each source is added into a running sum at
        its pixel offset, and the sum is divided by the per-pixel count of
        valid contributions. Pixels with no valid contribution receive the
        first source's nodata value (or 0 when it has none).

        Args:
            src_files: Open rasterio datasets to merge.

        Returns:
            Tuple of (mean array cast to the first source's dtype, transform
            of the merged grid).
        """
        first = src_files[0]
        transform = first.transform
        x_res = abs(transform.a)
        y_res = abs(transform.e)

        # Union of all source bounds
        bounds = first.bounds
        for src in src_files[1:]:
            bounds = rasterio.coords.BoundingBox(
                min(bounds.left, src.bounds.left),
                min(bounds.bottom, src.bounds.bottom),
                max(bounds.right, src.bounds.right),
                max(bounds.top, src.bounds.top),
            )

        # Round to the nearest pixel instead of truncating: float division
        # can land just below a whole number (e.g. 0.3 / 0.1 -> 2.999...),
        # and truncation would then drop a row/column and break the slice
        # assignment below.
        width = int(round((bounds.right - bounds.left) / x_res))
        height = int(round((bounds.top - bounds.bottom) / y_res))

        # Create new transform for merged bounds
        merged_transform = rasterio.transform.from_bounds(
            bounds.left, bounds.bottom, bounds.right, bounds.top, width, height
        )

        estimated_memory = height * width * first.count * 8  # float64
        if estimated_memory > 1e9:  # 1GB threshold
            self.logger.warning(
                f"Large memory usage expected: {estimated_memory/1e9:.1f}GB"
            )

        # Running sum per band and number of valid contributions per pixel
        sum_array = np.zeros((first.count, height, width), dtype=np.float64)
        count_array = np.zeros((height, width), dtype=np.int32)

        for src in src_files:
            data = src.read()

            # Pixel offset of this source inside the merged grid (nearest
            # pixel, for the same floating-point reason as above).
            src_bounds = src.bounds
            col_off = int(round((src_bounds.left - bounds.left) / x_res))
            row_off = int(round((bounds.top - src_bounds.top) / y_res))

            # Validity is taken from the first band and applied to all bands.
            if src.nodata is not None:
                valid_mask = data[0] != src.nodata
            else:
                valid_mask = np.ones(data[0].shape, dtype=bool)

            end_row = row_off + data.shape[1]
            end_col = col_off + data.shape[2]

            sum_array[:, row_off:end_row, col_off:end_col] += np.where(
                valid_mask, data, 0
            )
            count_array[row_off:end_row, col_off:end_col] += valid_mask.astype(np.int32)

        # Divide only where at least one source contributed; elsewhere keep
        # the pre-filled nodata value.
        mean_array = np.divide(
            sum_array,
            count_array,
            out=np.full_like(sum_array, first.nodata or 0, dtype=sum_array.dtype),
            where=count_array > 0,
        )

        return mean_array.astype(first.dtypes[0]), merged_transform

    def _load_metadata(self):
        """
        Read the raster's metadata and store it in ``self._cache``.

        Cached keys: transform, crs (as a string), bounds, width, height,
        resolution (absolute pixel width/height), x_transform, y_transform,
        nodata, count and dtype (of the first band). The file is read on
        every call.

        Raises:
            FileNotFoundError: If the raster cannot be opened.
            RuntimeError: For any other error while reading the metadata.
        """
        try:
            with self.open_dataset() as src:
                transform = src.transform
                self._cache["transform"] = transform
                self._cache["crs"] = src.crs.to_string()
                self._cache["bounds"] = src.bounds
                self._cache["width"] = src.width
                self._cache["height"] = src.height
                self._cache["resolution"] = (abs(transform.a), abs(transform.e))
                self._cache["x_transform"] = transform.a
                self._cache["y_transform"] = transform.e
                self._cache["nodata"] = src.nodata
                self._cache["count"] = src.count
                self._cache["dtype"] = src.dtypes[0]
        except (rasterio.errors.RasterioIOError, FileNotFoundError) as e:
            # Chain explicitly so the original error is kept as the cause.
            raise FileNotFoundError(f"Could not read raster metadata: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Unexpected error loading metadata: {e}") from e

    @property
    def is_merged(self) -> bool:
        """Check if this processor was created from multiple rasters."""
        return len(self.dataset_paths) > 1

    @property
    def source_count(self) -> int:
        """Get the number of source rasters."""
        return len(self.dataset_paths)

    @property
    def transform(self):
        """Get the transform from the TIF file"""
        return self._cache["transform"]

    @property
    def crs(self):
        """Get the coordinate reference system from the TIF file"""
        return self._cache["crs"]

    @property
    def bounds(self):
        """Get the bounds of the TIF file"""
        return self._cache["bounds"]

    @property
    def resolution(self) -> Tuple[float, float]:
        """Get the x and y resolution (pixel width and height or pixel size) from the TIF file"""
        return self._cache["resolution"]

    @property
    def x_transform(self) -> float:
        """Get the x transform from the TIF file"""
        return self._cache["x_transform"]

    @property
    def y_transform(self) -> float:
        """Get the y transform from the TIF file"""
        return self._cache["y_transform"]

    @property
    def count(self) -> int:
        """Get the band count from the TIF file"""
        return self._cache["count"]

    @property
    def nodata(self) -> int:
        """Get the value representing no data in the rasters"""
        return self._cache["nodata"]

    @property
    def dtype(self):
        """Get the data types from the TIF file"""
        return self._cache.get("dtype", [])

    @property
    def width(self):
        return self._cache["width"]

    @property
    def height(self):
        return self._cache["height"]

    def to_dataframe(
        self,
        drop_nodata: bool = True,
        check_memory: bool = True,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Convert the raster data into a pandas DataFrame.

        Args:
            drop_nodata: If True, pixels with the nodata value are excluded.
            check_memory: If True, runs the memory guard before loading.
            min_value: Optional minimum threshold to filter pixels.
            max_value: Optional maximum threshold to filter pixels.
            **kwargs: Optional `band_number` (used in 'single' mode only,
                default 1) and `band_names`.

        Returns:
            A DataFrame with 'lon', 'lat', and band values.

        Raises:
            ValueError: If the conversion fails; the original exception is
                attached as the cause.
        """
        # Memory guard check
        if check_memory:
            self._memory_guard("conversion", threshold_percent=80.0)

        # 'single' mode reads one band; every other mode passes None (all bands).
        band_number = kwargs.get("band_number", 1) if self.mode == "single" else None

        try:
            return self._to_dataframe(
                band_number=band_number,
                drop_nodata=drop_nodata,
                band_names=kwargs.get("band_names", None),
                min_value=min_value,
                max_value=max_value,
            )
        except Exception as e:
            # Chain explicitly so the root cause stays attached to the error.
            raise ValueError(
                f"Failed to process TIF file in mode '{self.mode}'. "
                f"Please ensure the file is valid and matches the selected mode. "
                f"Original error: {e}"
            ) from e


    def to_geodataframe(
        self,
        check_memory=True,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
        **kwargs,
    ) -> gpd.GeoDataFrame:
        """
        Convert the raster data into a GeoDataFrame of pixel footprints.

        Each row is one pixel; its geometry is a box one pixel wide and high,
        centred on the pixel's 'lon'/'lat' coordinates.

        Args:
            check_memory: If True, runs the memory guard first.
            min_value: Optional minimum threshold for pixel values.
            max_value: Optional maximum threshold for pixel values.
            **kwargs: Additional arguments passed to `to_dataframe`.

        Returns:
            A GeoDataFrame in the raster's CRS containing the pixel boxes and
            their values.
        """
        if check_memory:
            self._memory_guard("conversion", threshold_percent=80.0)

        # Filter first so geometries are only built for the surviving pixels.
        pixels = self.to_dataframe(
            check_memory=False, min_value=min_value, max_value=max_value, **kwargs
        )

        x_res, y_res = self.resolution
        half_w = x_res / 2
        half_h = y_res / 2

        # One bounding box per pixel, centred on its coordinates.
        cells = [
            box(x - half_w, y - half_h, x + half_w, y + half_h)
            for x, y in zip(pixels["lon"], pixels["lat"])
        ]

        return gpd.GeoDataFrame(pixels, geometry=cells, crs=self.crs)

    def to_dataframe_chunked(
        self, drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs
    ):
        """
        Convert the raster to a DataFrame by processing it in chunks.

        Args:
            drop_nodata: Whether to exclude pixels with the nodata value.
            chunk_size: Chunk size handed to ``_get_chunk_windows``. If None,
                it is derived from ``target_memory_mb``.
            target_memory_mb: Target memory limit per chunk in megabytes.
            **kwargs: Optional `band_number` (used in 'single' mode only,
                default 1) and `band_names`.

        Returns:
            The result of ``_to_dataframe_chunked`` for the computed windows.
        """
        if chunk_size is None:
            chunk_size = self._calculate_optimal_chunk_size(
                "conversion", target_memory_mb
            )

        windows = self._get_chunk_windows(chunk_size)

        # 'single' mode reads one band; rgb, rgba and multi pass None.
        selected_band = kwargs.get("band_number", 1) if self.mode == "single" else None

        return self._to_dataframe_chunked(
            windows,
            band_number=selected_band,
            drop_nodata=drop_nodata,
            band_names=kwargs.get("band_names", None),
        )

    def clip_to_geometry(
        self,
        geometry: Union[
            Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
        ],
        crop: bool = True,
        all_touched: bool = True,
        invert: bool = False,
        nodata: Optional[Union[int, float]] = None,
        pad: bool = False,
        pad_width: float = 0.5,
        return_clipped_processor: bool = True,
    ) -> Union["TifProcessor", tuple]:
        """
        Clip the raster to the given geometry using rasterio's ``mask``.

        Args:
            geometry: The geometry to clip to (Polygon, GDF, GeoSeries, etc.).
            crop: Forwarded to ``mask``; crops the output extent to the geometry.
            all_touched: Forwarded to ``mask``; include every touched pixel.
            invert: Forwarded to ``mask``; mask pixels inside the geometry.
            nodata: Nodata value for the output; falls back to the source's.
            pad: Forwarded to ``mask``; pad the geometry before clipping.
            pad_width: Forwarded to ``mask``; padding width.
            return_clipped_processor: If True, returns a new TifProcessor instance.

        Returns:
            A new TifProcessor instance (if return_clipped_processor is True) or
            a tuple of (clipped_array, transform, metadata).

        Raises:
            ValueError: If the geometry does not overlap the raster, or for any
                other ValueError raised while masking.
        """
        # Normalise the geometry input, then run the CRS validation helper.
        shapes = self._prepare_geometry_for_clipping(geometry)
        self._validate_geometry_crs(geometry)

        with self.open_dataset() as src:
            try:
                clipped_data, clipped_transform = mask(
                    dataset=src,
                    shapes=shapes,
                    crop=crop,
                    all_touched=all_touched,
                    invert=invert,
                    nodata=nodata,
                    pad=pad,
                    pad_width=pad_width,
                    filled=True,
                )

                # Source metadata, adjusted to the clipped array.
                output_nodata = src.nodata if nodata is None else nodata
                clipped_meta = src.meta.copy()
                clipped_meta.update(
                    {
                        "height": clipped_data.shape[1],
                        "width": clipped_data.shape[2],
                        "transform": clipped_transform,
                        "nodata": output_nodata,
                    }
                )
            except ValueError as e:
                # Only the "no overlap" case gets a friendlier message.
                if "Input shapes do not overlap raster" not in str(e):
                    raise
                raise ValueError(
                    "The geometry does not overlap with the raster. "
                    "Check that both are in the same coordinate reference system."
                ) from e

        if not return_clipped_processor:
            return clipped_data, clipped_transform, clipped_meta

        # Wrap the clipped data in a new TifProcessor.
        return self._create_clipped_processor(clipped_data, clipped_meta)

    def clip_to_bounds(
        self,
        bounds: tuple,
        bounds_crs: Optional[str] = None,
        return_clipped_processor: bool = True,
    ) -> Union["TifProcessor", tuple]:
        """
        Clip the raster to a rectangular bounding box.

        Args:
            bounds: Bounding box as (minx, miny, maxx, maxy).
            bounds_crs: CRS of ``bounds``. When given and different from the
                raster CRS, the box is reprojected to the raster CRS first.
            return_clipped_processor: If True, returns a new TifProcessor instance.

        Returns:
            Whatever ``clip_to_geometry`` returns for the box: the clipped
            TifProcessor or a tuple of data/metadata.
        """
        bbox_geom = box(*bounds)

        # Reproject the box only when its CRS is known and differs.
        needs_reprojection = bounds_crs is not None and not self.crs == bounds_crs
        if needs_reprojection:
            reprojected = gpd.GeoDataFrame(
                [1], geometry=[bbox_geom], crs=bounds_crs
            ).to_crs(self.crs)
            bbox_geom = reprojected.geometry.iloc[0]

        return self.clip_to_geometry(
            geometry=bbox_geom,
            crop=True,
            return_clipped_processor=return_clipped_processor,
        )

    def to_graph(
        self,
        connectivity: Literal[4, 8] = 4,
        band: Optional[int] = None,
        include_coordinates: bool = False,
        graph_type: Literal["networkx", "sparse"] = "networkx",
        check_memory: bool = True,
    ) -> Union[nx.Graph, sp.csr_matrix]:
        """
        Convert the raster into a graph representation based on pixel adjacency.

        Every valid (non-nodata) pixel becomes a node, numbered sequentially in
        row-major order. Two valid pixels are connected when they are neighbours
        under the chosen connectivity. Each undirected edge is collected once and
        weighted with the pixel value of its higher-numbered endpoint.

        Args:
            connectivity: Neighborhood connectivity (4 for von Neumann, 8 for Moore).
                Any value other than 4 is treated as 8.
            band: Band number to use for node values (1-indexed). Defaults to the
                first band.
            include_coordinates: If True, adds 'x' and 'y' attributes to nodes.
            graph_type: Output type ('networkx' for Graph object, anything else
                yields a symmetric SciPy CSR matrix).
            check_memory: If True, runs the memory guard before processing.

        Returns:
            A NetworkX Graph or a SciPy sparse CSR matrix of shape
            (num_valid_pixels, num_valid_pixels).

        Raises:
            ValueError: If the requested band does not exist in the raster.
        """

        # Memory guard check
        if check_memory:
            self._memory_guard("graph", threshold_percent=80.0)

        with self.open_dataset() as src:
            band_idx = band - 1 if band is not None else 0
            if band_idx < 0 or band_idx >= src.count:
                raise ValueError(
                    f"Band {band} not available. Raster has {src.count} bands"
                )

            data = src.read(band_idx + 1)
            nodata = src.nodata if src.nodata is not None else self.nodata

            if nodata is None:
                valid_mask = np.ones_like(data, dtype=bool)
            elif isinstance(nodata, (float, np.floating)) and np.isnan(nodata):
                # NaN never compares equal to anything, so `data != nodata`
                # would mark every pixel (including the NaN ones) as valid.
                valid_mask = ~np.isnan(data)
            else:
                valid_mask = data != nodata

            height, width = data.shape

            # Find all valid pixels
            valid_rows, valid_cols = np.where(valid_mask)
            num_valid_pixels = len(valid_rows)

            # Create a sequential mapping from (row, col) to a node ID
            node_map = np.full(data.shape, -1, dtype=int)
            node_map[valid_rows, valid_cols] = np.arange(num_valid_pixels)

            # Define neighborhood offsets
            if connectivity == 4:
                # von Neumann neighborhood (4-connectivity)
                offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
            else:  # connectivity == 8
                # Moore neighborhood (8-connectivity)
                offsets = [
                    (-1, -1),
                    (-1, 0),
                    (-1, 1),
                    (0, -1),
                    (0, 1),
                    (1, -1),
                    (1, 0),
                    (1, 1),
                ]

            # Collect nodes and edges
            nodes_to_add = []
            edges_to_add = []

            for row, col in zip(valid_rows, valid_cols):
                current_node_id = node_map[row, col]

                # Prepare node attributes
                node_attrs = {"value": float(data[row, col])}
                if include_coordinates:
                    x, y = src.xy(row, col)
                    node_attrs["x"] = x
                    node_attrs["y"] = y
                nodes_to_add.append((current_node_id, node_attrs))

                # Find neighbors and collect edges
                for dy, dx in offsets:
                    neighbor_row, neighbor_col = row + dy, col + dx

                    # Check if neighbor is within bounds and is a valid pixel
                    if (
                        0 <= neighbor_row < height
                        and 0 <= neighbor_col < width
                        and valid_mask[neighbor_row, neighbor_col]
                    ):
                        neighbor_node_id = node_map[neighbor_row, neighbor_col]

                        # Ensure each edge is added only once
                        if current_node_id < neighbor_node_id:
                            neighbor_value = float(data[neighbor_row, neighbor_col])
                            edges_to_add.append(
                                (current_node_id, neighbor_node_id, neighbor_value)
                            )

            if graph_type == "networkx":
                G = nx.Graph()
                G.add_nodes_from(nodes_to_add)
                G.add_weighted_edges_from(edges_to_add)
                return G

            # Sparse matrix output. The reshape keeps the array two-dimensional
            # even when no edges were found, so the column slicing below cannot
            # fail on an empty edge list.
            edges_array = np.array(edges_to_add, dtype=float).reshape(-1, 3)
            row_indices = edges_array[:, 0].astype(np.int64)
            col_indices = edges_array[:, 1].astype(np.int64)
            weights = edges_array[:, 2]

            # Add reverse edges for symmetric matrix
            from_idx = np.concatenate([row_indices, col_indices])
            to_idx = np.concatenate([col_indices, row_indices])
            weights = np.concatenate([weights, weights])

            return sp.coo_matrix(
                (weights, (from_idx, to_idx)),
                shape=(num_valid_pixels, num_valid_pixels),
            ).tocsr()

    def sample_by_coordinates(
        self, coordinate_list: List[Tuple[float, float]], **kwargs
    ) -> Union[np.ndarray, dict]:
        """
        Extract raster values at specific point coordinates.

        Args:
            coordinate_list: List of (longitude, latitude) tuples.
            **kwargs: Additional arguments passed to the dataset's ``sample``
                method in every mode. In RGB/RGBA mode the band index is fixed
                per colour, so a caller-supplied ``indexes`` is ignored there.

        Returns:
            Numpy array of values (single-band: one value per point; multi-band:
            one row of band values per point) or a dict mapping colour name to a
            list of values (RGB/RGBA).

        Raises:
            ValueError: If the mode is 'rgb'/'rgba' but the raster does not have
                3/4 bands respectively.
        """
        self.logger.info("Sampling raster values at the coordinates...")

        with self.open_dataset() as src:
            if self.mode == "rgba":
                if self.count != 4:
                    raise ValueError("RGBA mode requires a 4-band TIF file")
                colors = ("red", "green", "blue", "alpha")
            elif self.mode == "rgb":
                if self.count != 3:
                    raise ValueError("RGB mode requires a 3-band TIF file")
                colors = ("red", "green", "blue")
            else:
                colors = None

            if colors is not None:
                # The band index is dictated by the colour, so drop any
                # conflicting 'indexes' the caller may have supplied.
                band_kwargs = {k: v for k, v in kwargs.items() if k != "indexes"}
                return {
                    color: [
                        vals[0]
                        for vals in src.sample(
                            coordinate_list, indexes=band_idx, **band_kwargs
                        )
                    ]
                    for band_idx, color in enumerate(colors, 1)
                }

            if self.count > 1:
                return np.array(list(src.sample(coordinate_list, **kwargs)))

            return np.array(
                [vals[0] for vals in src.sample(coordinate_list, **kwargs)]
            )

    def sample_by_polygons(
        self,
        polygon_list,
        stat: Union[str, Callable, List[Union[str, Callable]]] = "mean",
    ):
        """
        Sample raster values within polygons and compute aggregate statistics.

        Sampling is best-effort: a polygon that cannot be processed (for
        example because masking fails) yields NaN instead of aborting the
        whole run; the failure is logged at debug level.

        Args:
            polygon_list: List of Shapely Polygon or MultiPolygon objects.
            stat: Statistic(s) to compute. Can be a string naming a NumPy
                  function (e.g., 'mean') or 'count', a callable, or a list of
                  both.

        Returns:
            Numpy array of results (if single stat) or a list of dictionaries
            keyed by statistic name (if a list of stats was given).

        Raises:
            AttributeError: If a string statistic is not 'count' and is not an
                attribute of NumPy.
        """
        # Determine if single or multiple stats
        single_stat = not isinstance(stat, list)
        stats_list = [stat] if single_stat else stat

        # Prepare stat functions and the names used as dictionary keys
        stat_funcs = []
        stat_names = []

        for s in stats_list:
            if callable(s):
                stat_funcs.append(s)
                stat_names.append(
                    s.__name__
                    if hasattr(s, "__name__")
                    else f"custom_{len(stat_names)}"
                )
            else:
                # 'count' has no NumPy function of that meaning; use len()
                stat_funcs.append(len if s == "count" else getattr(np, s))
                stat_names.append(s)

        def _empty_result():
            """Placeholder returned for polygons with no usable data."""
            if single_stat:
                return np.nan
            return {name: np.nan for name in stat_names}

        results = []

        with self.open_dataset() as src:
            for polygon in tqdm(polygon_list):
                try:
                    out_image, _ = mask(src, [polygon], crop=True, filled=False)

                    # Use masked arrays for more efficient nodata handling
                    if hasattr(out_image, "mask"):
                        valid_data = out_image.compressed()
                    elif self.nodata is not None:
                        # Explicit None check: a nodata value of 0 is falsy
                        # but must still be filtered out.
                        valid_data = out_image[out_image != self.nodata]
                    else:
                        valid_data = out_image.flatten()

                    if len(valid_data) == 0:
                        results.append(_empty_result())
                    elif single_stat:
                        results.append(stat_funcs[0](valid_data))
                    else:
                        # Compute all statistics for this polygon; one failing
                        # statistic does not invalidate the others.
                        polygon_stats = {}
                        for func, name in zip(stat_funcs, stat_names):
                            try:
                                polygon_stats[name] = func(valid_data)
                            except Exception:
                                polygon_stats[name] = np.nan
                        results.append(polygon_stats)

                except Exception as e:
                    # Deliberate best-effort: keep going, but leave a trace.
                    self.logger.debug(f"Error processing polygon: {e}")
                    results.append(_empty_result())

        return np.array(results) if single_stat else results

    def sample_by_polygons_batched(
        self,
        polygon_list: List[Union[Polygon, MultiPolygon]],
        stat: Union[str, Callable] = "mean",
        batch_size: int = 100,
        n_workers: int = 4,
        show_progress: bool = True,
        check_memory: bool = True,
        **kwargs,
    ) -> np.ndarray:
        """
        Sample raster values by polygons in parallel using batch processing.

        The polygon list is split into chunks of ``batch_size`` which are
        distributed over a multiprocessing pool; each worker opens the raster
        once through ``_initializer_worker``.

        Args:
            polygon_list: List of Shapely Polygon or MultiPolygon objects.
            stat: Statistic to compute for each polygon: a callable, 'count',
                or the name of a NumPy function.
            batch_size: Number of polygons to process in each worker batch.
            n_workers: Number of parallel processes to use.
            show_progress: If True, displays a progress bar.
            check_memory: If True, runs the memory guard before starting and
                warns with a suggested worker count when it reports unsafe.
            **kwargs: Additional arguments (currently unused).

        Returns:
            Numpy array of statistics for each polygon (empty for an empty
            input list).
        """
        # All function-local imports live at the top. Importing `warnings`
        # further down would make it a local name for the whole function and
        # turn any earlier `warnings.warn(...)` into an UnboundLocalError.
        import multiprocessing
        import sys
        import warnings

        # Memory guard check with n_workers consideration
        if check_memory:
            is_safe = self._memory_guard(
                "batched_sampling",
                threshold_percent=85.0,
                n_workers=n_workers,
                raise_error=False,
            )

            if not is_safe:
                # Suggest reducing n_workers
                memory_info = self._check_available_memory()
                estimates = self._estimate_memory_usage("batched_sampling", n_workers=1)

                # Calculate optimal workers; guard against a zero estimate
                per_worker = estimates["per_worker"]
                if per_worker > 0:
                    suggested_workers = max(
                        1, int(memory_info["available"] * 0.7 / per_worker)
                    )
                else:
                    suggested_workers = 1

                warnings.warn(
                    f"Consider reducing n_workers from {n_workers} to {suggested_workers} "
                    f"to reduce memory pressure.",
                    ResourceWarning,
                )

        # Platform check
        if sys.platform in ["win32", "darwin"]:
            if multiprocessing.get_start_method(allow_none=True) != "fork":
                warnings.warn(
                    "Batched sampling may not work on Windows/macOS. "
                    "Use sample_by_polygons() if you encounter errors.",
                    RuntimeWarning,
                )

        def _chunk_list(data_list, chunk_size):
            """Yield successive chunks from data_list."""
            for i in range(0, len(data_list), chunk_size):
                yield data_list[i : i + chunk_size]

        if len(polygon_list) == 0:
            return np.array([])

        # Resolve the statistic the same way sample_by_polygons() does, so
        # that 'count' works in both entry points.
        if callable(stat):
            stat_func = stat
        elif stat == "count":
            stat_func = len
        else:
            stat_func = getattr(np, stat)

        polygon_chunks = list(_chunk_list(polygon_list, batch_size))

        with multiprocessing.Pool(
            initializer=self._initializer_worker, processes=n_workers
        ) as pool:
            process_func = partial(self._process_polygon_batch, stat_func=stat_func)
            if show_progress:
                batched_results = list(
                    tqdm(
                        pool.imap(process_func, polygon_chunks),
                        total=len(polygon_chunks),
                        desc="Sampling polygons",
                    )
                )
            else:
                batched_results = list(pool.imap(process_func, polygon_chunks))

            # Flatten the per-batch lists back into one result per polygon
            results = [item for sublist in batched_results for item in sublist]

        return np.array(results)

    def _initializer_worker(self):
        """
        Per-process pool initializer.

        Reads the raster bytes once, wraps them in a rasterio MemoryFile and
        stores both the MemoryFile and the opened dataset in module-level
        globals so later tasks in the same worker can reuse them.
        """
        global src_handle, memfile_handle

        # Resolve a directly readable local path, using the same priority as
        # open_dataset: merged > reprojected > original.
        if self._merged_file_path:
            readable_path = self._merged_file_path
        elif self._reprojected_file_path:
            readable_path = self._reprojected_file_path
        elif isinstance(self.data_store, LocalDataStore):
            readable_path = str(self.dataset_path)
        else:
            readable_path = None

        # Local temp/original files are read straight from disk; anything else
        # goes through the custom data store.
        if readable_path:
            stream = open(local_path := readable_path, "rb") if False else open(readable_path, "rb")
        else:
            stream = self.data_store.open(str(self.dataset_path), "rb")

        with stream as raw:
            memfile_handle = rasterio.MemoryFile(raw.read())
            src_handle = memfile_handle.open()

    def _get_worker_dataset(self):
        """Get dataset handle for worker process."""
        global src_handle
        if src_handle is None:
            raise RuntimeError("Raster dataset not initialized in this process.")
        return src_handle

    def _process_single_polygon(self, polygon, stat_func):
        """
        Compute one statistic for one polygon using the worker's dataset.

        Args:
            polygon: Geometry passed to the raster mask operation.
            stat_func: Callable applied to the 1-D array of valid pixel values.

        Returns:
            The value of ``stat_func`` over the valid pixels, or NaN when the
            polygon covers no valid pixels or processing fails.
        """
        try:
            src = self._get_worker_dataset()
            out_image, _ = mask(src, [polygon], crop=True, filled=False)

            if hasattr(out_image, "mask"):
                valid_data = out_image.compressed()
            elif self.nodata is not None:
                # Explicit None check: a nodata value of 0 is falsy but must
                # still be filtered out.
                valid_data = out_image[out_image != self.nodata]
            else:
                valid_data = out_image.flatten()

            return stat_func(valid_data) if len(valid_data) > 0 else np.nan
        except RuntimeError as e:
            self.logger.error(f"Worker not initialized: {e}")
            return np.nan
        except Exception as e:
            # Best-effort: a single bad polygon must not abort the batch.
            self.logger.debug(f"Error processing polygon: {e}")
            return np.nan

    def _process_polygon_batch(self, polygon_batch, stat_func):
        """
        Processes a batch of polygons.
        """
        return [
            self._process_single_polygon(polygon, stat_func)
            for polygon in polygon_batch
        ]

    def _to_dataframe(
        self,
        band_number: Optional[int] = None,
        drop_nodata: bool = True,
        band_names: Optional[Union[str, List[str]]] = None,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
    ) -> pd.DataFrame:
        """
        Process TIF to DataFrame - handles both single-band and multi-band.

        Args:
            band_number: Specific band to read (1-indexed). If None, reads all bands.
            drop_nodata: Whether to drop nodata values
            band_names: Custom names for bands (multi-band only)
            min_value: Minimum threshold for pixel values (exclusive)
            max_value: Maximum threshold for pixel values (exclusive)

        Returns:
            pd.DataFrame with lon, lat, and band value(s) filtered according to drop_nodata, min_value and max_value
        """
        with self.open_dataset() as src:
            if band_number is not None:
                # SINGLE BAND MODE
                band = src.read(band_number)
                nodata_value = src.nodata if src.nodata is not None else self.nodata

                # Build mask combining nodata and value thresholds
                mask = self._build_data_mask(
                    band, drop_nodata, nodata_value, min_value, max_value
                )

                # Extract coordinates and values with mask
                lons, lats = self._extract_coordinates_with_mask(mask)
                values = np.extract(mask, band) if mask is not None else band.flatten()

                band_name = (
                    band_names
                    if isinstance(band_names, str)
                    else (
                        band_names[band_number]
                        if isinstance(band_names, list)
                        else "pixel_value"
                    )
                )

                return pd.DataFrame({"lon": lons, "lat": lats, band_name: values})
            else:
                # MULTI-BAND MODE (all bands)
                stack = src.read()
                nodata_value = src.nodata if src.nodata is not None else self.nodata

                # Auto-detect band names by mode
                if band_names is None:
                    if self.mode == "rgb":
                        band_names = ["red", "green", "blue"]
                    elif self.mode == "rgba":
                        band_names = ["red", "green", "blue", "alpha"]
                    else:
                        band_names = [
                            src.descriptions[i] or f"band_{i+1}"
                            for i in range(self.count)
                        ]
                # Build mask combining nodata and value thresholds
                mask = self._build_multi_band_mask(
                    stack, drop_nodata, nodata_value, min_value, max_value
                )

                # Create DataFrame
                data_dict = self._bands_to_dict(stack, self.count, band_names, mask)
                df = pd.DataFrame(data_dict)

                # RGBA: normalize alpha if needed
                if (
                    self.mode == "rgba"
                    and "alpha" in df.columns
                    and df["alpha"].max() > 1
                ):
                    df["alpha"] = df["alpha"] / 255.0

            return df

    def _to_dataframe_chunked(
        self,
        windows: List[rasterio.windows.Window],
        band_number: Optional[int] = None,
        drop_nodata: bool = True,
        band_names: Optional[Union[str, List[str]]] = None,
        show_progress: bool = True,
    ) -> pd.DataFrame:
        """
        Universal chunked converter for ALL modes.

        Reads the raster window by window, converts each window to a
        DataFrame and concatenates the pieces.

        Args:
            windows: Raster windows to process, in output order.
            band_number: Specific band to read (1-indexed). If None, reads all bands.
            drop_nodata: Whether to drop nodata values.
            band_names: Column name (string, single-band mode) or list of
                column names (all-band mode). Auto-detected in all-band mode
                when omitted.
            show_progress: If True, displays a progress bar.

        Returns:
            pd.DataFrame with the rows of all windows; an empty DataFrame when
            no windows were given.
        """

        chunks = []
        iterator = tqdm(windows, desc="Processing chunks") if show_progress else windows

        with self.open_dataset() as src:
            # Same nodata resolution as the non-chunked converter: prefer the
            # dataset's own value, fall back to the processor's.
            nodata_value = src.nodata if src.nodata is not None else self.nodata

            # Auto-detect band names ONCE (before loop)
            if band_number is None and band_names is None:
                if self.mode == "rgb":
                    band_names = ["red", "green", "blue"]
                elif self.mode == "rgba":
                    band_names = ["red", "green", "blue", "alpha"]
                else:  # multi
                    band_names = [
                        src.descriptions[i] or f"band_{i+1}" for i in range(self.count)
                    ]

            for window in iterator:
                if band_number is not None:
                    # SINGLE BAND
                    band_chunk = src.read(band_number, window=window)
                    mask = self._build_data_mask(band_chunk, drop_nodata, nodata_value)
                    lons, lats = self._get_chunk_coordinates(window, src)
                    band_name = (
                        band_names if isinstance(band_names, str) else "pixel_value"
                    )

                    # Build chunk DataFrame (could use helper but simple enough)
                    if mask is not None:
                        mask_flat = mask.flatten()
                        chunk_df = pd.DataFrame(
                            {
                                "lon": lons[mask_flat],
                                "lat": lats[mask_flat],
                                band_name: band_chunk.flatten()[mask_flat],
                            }
                        )
                    else:
                        chunk_df = pd.DataFrame(
                            {"lon": lons, "lat": lats, band_name: band_chunk.flatten()}
                        )
                else:
                    # MULTI-BAND (includes RGB/RGBA)
                    stack_chunk = src.read(window=window)
                    mask = self._build_multi_band_mask(
                        stack_chunk, drop_nodata, nodata_value
                    )
                    lons, lats = self._get_chunk_coordinates(window, src)

                    # Build DataFrame using helper
                    band_dict = {
                        band_names[i]: stack_chunk[i] for i in range(self.count)
                    }
                    chunk_df = self._build_chunk_dataframe(lons, lats, band_dict, mask)

                    # RGBA: normalize alpha
                    if self.mode == "rgba" and "alpha" in chunk_df.columns:
                        if chunk_df["alpha"].max() > 1:
                            chunk_df["alpha"] = chunk_df["alpha"] / 255.0

                chunks.append(chunk_df)

        # pd.concat raises on an empty list, so handle "no windows" explicitly
        if not chunks:
            return pd.DataFrame()

        return pd.concat(chunks, ignore_index=True)

    def _prepare_geometry_for_clipping(
        self,
        geometry: Union[
            Polygon,
            MultiPolygon,
            MultiPoint,
            gpd.GeoDataFrame,
            gpd.GeoSeries,
            List[dict],
            dict,
        ],
    ) -> List[dict]:
        """
        Normalise the supported geometry inputs into a list of GeoJSON-like
        dicts suitable for rasterio.mask.

        A MultiPoint is replaced by its bounding box; GeoDataFrame/GeoSeries
        inputs contribute every non-None geometry; a dict is wrapped in a
        list; a list is returned unchanged.

        Raises:
            TypeError: If the input is none of the supported types.
        """
        # MultiPoint: clip to the rectangle enclosing all points
        if isinstance(geometry, MultiPoint):
            west, south, east, north = geometry.bounds
            envelope = box(west, south, east, north)
            return [envelope.__geo_interface__]

        # Single Shapely polygonal geometry
        if isinstance(geometry, (Polygon, MultiPolygon)):
            return [geometry.__geo_interface__]

        # GeoDataFrame: every geometry of the active geometry column
        if isinstance(geometry, gpd.GeoDataFrame):
            shapes = []
            for item in geometry.geometry:
                if item is not None:
                    shapes.append(item.__geo_interface__)
            return shapes

        # GeoSeries: every geometry of the series
        if isinstance(geometry, gpd.GeoSeries):
            shapes = []
            for item in geometry:
                if item is not None:
                    shapes.append(item.__geo_interface__)
            return shapes

        # Single GeoJSON-like dict
        if isinstance(geometry, dict):
            return [geometry]

        # List of GeoJSON-like dicts is already in the target format
        if isinstance(geometry, list):
            return geometry

        raise TypeError(
            f"Unsupported geometry type: {type(geometry)}. "
            "Supported types: Shapely geometries, GeoDataFrame, GeoSeries, "
            "GeoJSON-like dict, or list of GeoJSON-like dicts."
        )

    def _validate_geometry_crs(
        self,
        original_geometry: Any,
    ) -> None:
        """
        Log a warning when the geometry's CRS differs from the raster's CRS.

        Nothing is raised and nothing is reprojected; when either CRS is
        unknown the check is skipped silently.
        """
        target_crs = self.crs

        # GeoPandas containers carry a CRS; so may any other object that
        # exposes a `crs` attribute.
        source_crs = None
        is_geo_container = isinstance(
            original_geometry, (gpd.GeoDataFrame, gpd.GeoSeries)
        )
        if is_geo_container or hasattr(original_geometry, "crs"):
            source_crs = original_geometry.crs

        # Without both CRSs there is nothing to compare
        if source_crs is None or target_crs is None:
            return

        if target_crs == source_crs:
            return

        self.logger.warning(
            f"CRS mismatch detected! Raster CRS: {target_crs}, "
            f"Geometry CRS: {source_crs}. "
            "Consider reprojecting geometry to match raster CRS for accurate clipping."
        )

    def _create_clipped_processor(
        self, clipped_data: np.ndarray, clipped_meta: dict
    ) -> "TifProcessor":
        """
        Helper to create a new TifProcessor instance from clipped data.

        A throw-away 1x1 placeholder TIF is written first so a processor can
        be constructed; the clipped raster is then written into that
        processor's temp directory and the processor is re-pointed at it.
        The placeholder file and its directory are removed even if one of the
        intermediate steps fails.

        Args:
            clipped_data: Pixel array written to the new raster.
            clipped_meta: Keyword arguments for ``rasterio.open`` in write mode.

        Returns:
            A TifProcessor whose dataset path is the newly written clipped file
            and whose data_store is this processor's data_store.

        Raises:
            RuntimeError: If the clipped file does not exist after writing.
        """
        # Create a temporary placeholder file to initialize the processor
        # This allows us to get the processor's temp_dir
        placeholder_dir = tempfile.mkdtemp()
        placeholder_path = os.path.join(
            placeholder_dir, f"placeholder_{os.urandom(8).hex()}.tif"
        )

        try:
            # Create a minimal valid TIF file as placeholder
            placeholder_transform = rasterio.transform.from_bounds(0, 0, 1, 1, 1, 1)
            with rasterio.open(
                placeholder_path,
                "w",
                driver="GTiff",
                width=1,
                height=1,
                count=1,
                dtype="uint8",
                crs="EPSG:4326",
                transform=placeholder_transform,
            ) as dst:
                dst.write(np.zeros((1, 1, 1), dtype="uint8"))

            # Create a new TifProcessor instance with the placeholder
            # ALWAYS use LocalDataStore() for local temp paths, even if self.data_store is different
            new_processor = TifProcessor(
                dataset_path=placeholder_path,
                data_store=LocalDataStore(),
                mode=self.mode,
            )

            # Now save the clipped file directly to the new processor's temp directory
            clipped_file_path = os.path.join(
                new_processor._temp_dir, f"clipped_{os.urandom(8).hex()}.tif"
            )

            with rasterio.open(clipped_file_path, "w", **clipped_meta) as dst:
                dst.write(clipped_data)

            # Verify file was created successfully
            if not os.path.exists(clipped_file_path):
                raise RuntimeError(
                    f"Failed to create clipped file at {clipped_file_path}"
                )

            # Set the clipped file path and update processor attributes
            new_processor._clipped_file_path = clipped_file_path
            new_processor.dataset_path = clipped_file_path
            new_processor.dataset_paths = [Path(clipped_file_path)]

            # Restore original data_store to the new processor
            new_processor.data_store = self.data_store
        finally:
            # Clean up placeholder file and directory on success AND failure.
            # Each step is attempted independently so a missing file does not
            # prevent the directory from being removed.
            try:
                os.remove(placeholder_path)
            except OSError:
                pass
            try:
                os.rmdir(placeholder_dir)
            except OSError:
                pass

        # Reload metadata since the path changed
        new_processor._load_metadata()

        return new_processor


    def _get_basic_statistics(self, approx_ok: bool = False) -> Dict[str, Any]:
        """
        Compute per-band statistics (min, max, mean, std, sum, count).

        Args:
            approx_ok: Whether to allow approximate statistics.

        Returns:
            Dictionary containing per-band and overall statistics.
        """
        cache_key = "statistics_exact"
        if cache_key in self._cache:
            return self._cache[cache_key]

        if approx_ok:
            self.logger.debug(
                "approx_ok requested for statistics, but only exact statistics are supported."
            )

        band_stats: List[Dict[str, Union[int, float, None]]] = []
        overall = {
            "min": None,
            "max": None,
            "mean": None,
            "std": None,
            "sum": 0.0,
            "count": 0,
        }

        with self.open_dataset() as src:
            nodata_value = src.nodata if src.nodata is not None else self.nodata
            total_sum = 0.0
            total_sq_sum = 0.0
            total_count = 0

            for band_idx in range(1, src.count + 1):
                band_min = None
                band_max = None
                band_sum = 0.0
                band_sq_sum = 0.0
                band_count = 0

                for _, window in src.block_windows(bidx=band_idx):
                    block = src.read(band_idx, window=window, masked=False)

                    if nodata_value is not None:
                        valid_mask = block != nodata_value
                        if not np.any(valid_mask):
                            continue
                        valid = block[valid_mask]
                    else:
                        valid = block

                    valid = valid.astype(np.float64, copy=False)
                    if valid.size == 0:
                        continue

                    block_min = float(valid.min())
                    block_max = float(valid.max())
                    block_sum = float(valid.sum())
                    block_sq_sum = float(np.square(valid, dtype=np.float64).sum())
                    block_count = int(valid.size)

                    band_min = (
                        block_min if band_min is None else min(band_min, block_min)
                    )
                    band_max = (
                        block_max if band_max is None else max(band_max, block_max)
                    )
                    band_sum += block_sum
                    band_sq_sum += block_sq_sum
                    band_count += block_count

                if band_count == 0:
                    band_stats.append(
                        {
                            "band": band_idx,
                            "min": None,
                            "max": None,
                            "mean": None,
                            "std": None,
                            "sum": 0.0,
                            "count": 0,
                        }
                    )
                    continue

                band_mean = band_sum / band_count
                variance = max((band_sq_sum / band_count) - band_mean**2, 0.0)
                band_std = variance**0.5

                band_stats.append(
                    {
                        "band": band_idx,
                        "min": band_min,
                        "max": band_max,
                        "mean": band_mean,
                        "std": band_std,
                        "sum": band_sum,
                        "count": band_count,
                    }
                )

                overall["min"] = (
                    band_min
                    if overall["min"] is None
                    else min(overall["min"], band_min)
                )
                overall["max"] = (
                    band_max
                    if overall["max"] is None
                    else max(overall["max"], band_max)
                )
                total_sum += band_sum
                total_sq_sum += band_sq_sum
                total_count += band_count

            if total_count > 0:
                overall["sum"] = total_sum
                overall["count"] = total_count
                overall["mean"] = total_sum / total_count
                overall_variance = max(
                    (total_sq_sum / total_count) - overall["mean"] ** 2, 0.0
                )
                overall["std"] = overall_variance**0.5

        result = {
            "bands": band_stats,
            "overall": overall,
            "approximate": False,
        }

        self._cache[cache_key] = result
        return result

    def _get_pixel_coordinates(self):
        """Helper method to generate coordinate arrays for all pixels"""
        if "pixel_coords" not in self._cache:
            # use cached values
            bounds = self._cache["bounds"]
            width = self._cache["width"]
            height = self._cache["height"]
            pixel_size_x = self._cache["x_transform"]
            pixel_size_y = self._cache["y_transform"]

            self._cache["pixel_coords"] = np.meshgrid(
                np.linspace(
                    bounds.left + pixel_size_x / 2,
                    bounds.right - pixel_size_x / 2,
                    width,
                ),
                np.linspace(
                    bounds.top + pixel_size_y / 2,
                    bounds.bottom - pixel_size_y / 2,
                    height,
                ),
            )

        return self._cache["pixel_coords"]

    def _get_chunk_coordinates(self, window, src):
        """
        Compute the x/y coordinates of every pixel inside a window.

        Args:
            window: Window whose ``height`` and ``width`` define the pixel grid.
            src: Open dataset providing ``window_transform``.

        Returns:
            Tuple ``(xs, ys)`` of 1D arrays, one entry per pixel in row-major order.
        """
        row_idx, col_idx = np.meshgrid(
            np.arange(window.height), np.arange(window.width), indexing="ij"
        )
        window_affine = src.window_transform(window)
        x_vals, y_vals = rasterio.transform.xy(
            window_affine, row_idx.ravel(), col_idx.ravel()
        )
        return np.array(x_vals), np.array(y_vals)

    def _extract_coordinates_with_mask(self, mask=None):
        """Extract flattened coordinates, optionally applying a mask."""
        x_coords, y_coords = self._get_pixel_coordinates()

        if mask is not None:
            return np.extract(mask, x_coords), np.extract(mask, y_coords)

        return x_coords.flatten(), y_coords.flatten()

    def _build_data_mask(
        self,
        data: np.ndarray,
        drop_nodata: bool = True,
        nodata_value: Optional[float] = None,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
    ) -> Optional[np.ndarray]:
        """
        Build a boolean mask for filtering data based on nodata and value thresholds.

        Args:
            Data array to mask
            drop_no Whether to drop nodata values
            nodata_value: The nodata value to filter
            min_value: Minimum value threshold (exclusive)
            max_value: Maximum value threshold (exclusive)

        Returns:
            Boolean mask or None if no masking needed
        """
        masks = []

        # Nodata mask
        if drop_nodata and nodata_value is not None:
            masks.append(data != nodata_value)

        # Min value threshold
        if min_value is not None:
            masks.append(data > min_value)

        # Max value threshold
        if max_value is not None:
            masks.append(data < max_value)

        if not masks:
            return None

        # Combine all masks with AND logic
        combined_mask = masks[0]
        for mask in masks[1:]:
            combined_mask &= mask

        return combined_mask

    def _build_multi_band_mask(
        self,
        bands: np.ndarray,
        drop_nodata: bool = True,
        nodata_value: Optional[float] = None,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
    ) -> Optional[np.ndarray]:
        """
        Build mask for multi-band data.

        Drops pixels where ANY band has nodata or fails value thresholds.

        Args:
            bands: 3D array of shape (nbands, height, width)
            drop_no Whether to drop nodata values
            nodata_value: The nodata value to check
            min_value: Minimum value threshold (exclusive)
            max_value: Maximum value threshold (exclusive)

        Returns:
            Boolean mask or None if no masking needed
        """
        masks = []

        # Nodata mask - any band has nodata
        if drop_nodata and nodata_value is not None:
            has_nodata = np.any(bands == nodata_value, axis=0)
            masks.append(~has_nodata)

        # Value threshold masks - any band fails threshold
        if min_value is not None:
            below_min = np.any(bands <= min_value, axis=0)
            masks.append(~below_min)

        if max_value is not None:
            above_max = np.any(bands >= max_value, axis=0)
            masks.append(~above_max)

        if not masks:
            return None

        # Combine all masks with AND logic
        combined_mask = masks[0]
        for mask in masks[1:]:
            combined_mask &= mask

        return combined_mask

    def _bands_to_dict(self, bands, band_count, band_names, mask=None):
        """Read specified bands and return as a dictionary with optional masking."""

        lons, lats = self._extract_coordinates_with_mask(mask)
        data_dict = {"lon": lons, "lat": lats}

        for idx, name in enumerate(band_names[:band_count]):
            band_data = bands[idx]
            data_dict[name] = (
                np.extract(mask, band_data) if mask is not None else band_data.flatten()
            )

        return data_dict

    def _calculate_optimal_chunk_size(
        self, operation: str = "conversion", target_memory_mb: int = 500
    ) -> int:
        """
        Calculate optimal chunk size based on target memory usage.

        Args:
            operation: Type of operation ('conversion', 'graph').
            target_memory_mb: Target memory per chunk in megabytes.

        Returns:
            Optimal number of rows per chunk.
        """
        bytes_per_element = np.dtype(self.dtype).itemsize
        n_bands = self.count
        width = self.width

        # Adjust for operation type
        if operation == "conversion":
            # DataFrame overhead is roughly 2x
            bytes_per_row = width * n_bands * bytes_per_element * 2
        elif operation == "graph":
            # Graph needs additional space for edges
            bytes_per_row = width * bytes_per_element * 4  # Estimate
        else:
            bytes_per_row = width * n_bands * bytes_per_element

        target_bytes = target_memory_mb * 1024 * 1024
        chunk_rows = max(1, int(target_bytes / bytes_per_row))

        # Ensure chunk size doesn't exceed total height
        chunk_rows = min(chunk_rows, self.height)

        self.logger.info(
            f"Calculated chunk size: {chunk_rows} rows "
            f"(~{self._format_bytes(chunk_rows * bytes_per_row)} per chunk)"
        )

        return chunk_rows

    def _get_chunk_windows(self, chunk_size: int) -> List[rasterio.windows.Window]:
        """
        Split the raster into full-width row bands for chunked reading.

        Args:
            chunk_size: Number of rows per chunk; the last window may be shorter.

        Returns:
            List of rasterio.windows.Window objects, ordered top to bottom.
        """
        total_rows = self.height
        return [
            rasterio.windows.Window(
                col_off=0,
                row_off=first_row,
                width=self.width,
                height=min(first_row + chunk_size, total_rows) - first_row,
            )
            for first_row in range(0, total_rows, chunk_size)
        ]

    def _format_bytes(self, bytes_value: int) -> str:
        """Convert bytes to human-readable format."""
        for unit in ["B", "KB", "MB", "GB", "TB"]:
            if bytes_value < 1024.0:
                return f"{bytes_value:.2f} {unit}"
            bytes_value /= 1024.0
        return f"{bytes_value:.2f} PB"

    def _check_available_memory(self) -> dict:
        """
        Report current system memory figures obtained from psutil.

        Returns:
            Dict with ``total``, ``available``, ``used`` and ``percent`` as
            reported by ``psutil.virtual_memory()``, plus ``available_human``,
            the available amount formatted by ``_format_bytes``.
        """
        import psutil

        vm = psutil.virtual_memory()
        free_bytes = vm.available

        snapshot = {
            "total": vm.total,
            "available": free_bytes,
            "used": vm.used,
            "percent": vm.percent,
        }
        snapshot["available_human"] = self._format_bytes(free_bytes)
        return snapshot

    def _estimate_memory_usage(
        self, operation: str = "conversion", n_workers: int = 1
    ) -> dict:
        """
        Estimate memory usage for various operations.

        Args:
            operation: Type of operation ('conversion', 'batched_sampling', 'merge', 'graph')
            n_workers: Number of workers (for batched_sampling)

        Returns:
            Dict with estimated memory usage in bytes and human-readable format
        """
        bytes_per_element = np.dtype(self.dtype).itemsize
        n_pixels = self.width * self.height
        n_bands = self.count

        estimates = {}

        if operation == "conversion":
            # to_dataframe/to_geodataframe: full raster + DataFrame overhead
            raster_memory = n_pixels * n_bands * bytes_per_element
            # DataFrame overhead (roughly 2x for storage + processing)
            dataframe_memory = (
                n_pixels * n_bands * 16
            )  # 16 bytes per value in DataFrame
            total = raster_memory + dataframe_memory
            estimates["raster"] = raster_memory
            estimates["dataframe"] = dataframe_memory
            estimates["total"] = total

        elif operation == "batched_sampling":
            # Each worker loads full raster into MemoryFile
            # Need to get file size
            if self._merged_file_path:
                file_path = self._merged_file_path
            elif self._reprojected_file_path:
                file_path = self._reprojected_file_path
            else:
                file_path = str(self.dataset_path)

            try:
                import os

                file_size = os.path.getsize(file_path)
            except:
                # Estimate if can't get file size
                file_size = n_pixels * n_bands * bytes_per_element * 1.2  # Add overhead

            estimates["per_worker"] = file_size
            estimates["total"] = file_size * n_workers

        elif operation == "merge":
            # _merge_with_mean uses float64 arrays
            raster_memory = n_pixels * n_bands * 8  # float64
            estimates["sum_array"] = raster_memory
            estimates["count_array"] = n_pixels * 4  # int32
            estimates["total"] = raster_memory + n_pixels * 4

        elif operation == "graph":
            # to_graph: data + node_map + edges
            data_memory = n_pixels * bytes_per_element
            node_map_memory = n_pixels * 4  # int32
            # Estimate edges (rough: 4-connectivity = 4 edges per pixel)
            edges_memory = n_pixels * 4 * 3 * 8  # 3 values per edge, float64
            total = data_memory + node_map_memory + edges_memory
            estimates["data"] = data_memory
            estimates["node_map"] = node_map_memory
            estimates["edges"] = edges_memory
            estimates["total"] = total

        # Add human-readable format
        estimates["human_readable"] = self._format_bytes(estimates["total"])

        return estimates

    def _memory_guard(
        self,
        operation: str,
        threshold_percent: float = 80.0,
        n_workers: Optional[int] = None,
        raise_error: bool = False,
    ) -> bool:
        """
        Check if an operation is safe to perform given memory constraints.

        Args:
            operation: Type of operation to check.
            threshold_percent: Maximum % of available memory to use.
            n_workers: Number of workers (for parallel operations).
            raise_error: If True, raises MemoryError if unsafe.

        Returns:
            True if the operation is deemed safe, False otherwise.

        Raises:
            MemoryError: If raise_error is True and memory is insufficient.
        """
        import warnings

        estimates = self._estimate_memory_usage(operation, n_workers=n_workers or 1)
        memory_info = self._check_available_memory()

        estimated_usage = estimates["total"]
        available = memory_info["available"]
        threshold = available * (threshold_percent / 100.0)

        is_safe = estimated_usage <= threshold

        if not is_safe:
            usage_str = self._format_bytes(estimated_usage)
            available_str = memory_info["available_human"]

            message = (
                f"Memory warning: {operation} operation may require {usage_str} "
                f"but only {available_str} is available. "
                f"Current memory usage: {memory_info['percent']:.1f}%"
            )

            if raise_error:
                raise MemoryError(message)
            else:
                warnings.warn(message, ResourceWarning)
                if hasattr(self, "logger"):
                    self.logger.warning(message)

        return is_safe

    def _validate_mode_band_compatibility(self):
        """Validate that mode matches band count."""
        mode_requirements = {
            "single": (1, "1-band"),
            "rgb": (3, "3-band"),
            "rgba": (4, "4-band"),
        }

        if self.mode in mode_requirements:
            required_count, description = mode_requirements[self.mode]
            if self.count != required_count:
                raise ValueError(
                    f"{self.mode.upper()} mode requires a {description} TIF file"
                )
        elif self.mode == "multi" and self.count < 2:
            raise ValueError("Multi mode requires a TIF file with 2 or more bands")

    def save_to_file(
        self,
        output_path: Union[str, Path],
        compress: Optional[str] = "LZW",
        tiled: bool = True,
        blocksize: int = 512,
        bigtiff: Optional[str] = None,
        predictor: Optional[int] = None,
        num_threads: Optional[int] = None,
        cog: bool = False,
        overviews: Optional[List[int]] = None,
        overview_resampling: str = "nearest",
        **kwargs,
    ) -> Path:
        """
        Write the raster to ``output_path`` through the configured data store.

        The dataset is first written band by band to a local temporary file
        with the requested creation options, then the file's bytes are passed
        to ``self.data_store.write_file``. The temporary file is removed
        afterwards, whether or not the export succeeded.

        Args:
            output_path: Output file path.
            compress: Compression method (e.g., 'LZW', 'ZSTD'); 'NONE' or a
                falsy value disables it.
            tiled: If True, tiles the output using ``blocksize``.
            blocksize: Block width and height for tiled output.
            bigtiff: 'YES', 'NO', or 'IF_NEEDED' for large files.
            predictor: Compression predictor (2 for int, 3 for float).
            num_threads: Number of threads for compression.
            cog: If True, builds overviews and tags the output with LAYOUT=COG.
            overviews: Overview levels; defaults to [2, 4, 8, 16] when ``cog``
                is set and no levels are given.
            overview_resampling: Name of the ``Resampling`` member used for overviews.
            **kwargs: Additional creation options for rasterio; these override
                the options derived from the named arguments.

        Returns:
            Path to the saved TIF file.
        """
        output_path = Path(output_path)
        codec = compress.upper() if compress else None

        # Creation options derived from the named arguments
        options = {}
        if codec is not None and codec != "NONE":
            options["compress"] = codec
        if tiled:
            options["tiled"] = True
            options["blockxsize"] = blocksize
            options["blockysize"] = blocksize
        if bigtiff:
            options["BIGTIFF"] = bigtiff
        if predictor is not None:
            options["predictor"] = predictor
        if num_threads is not None:
            options["NUM_THREADS"] = num_threads

        # Default level/quality per codec, applied only when the caller gave none
        codec_defaults = {
            "DEFLATE": ("ZLEVEL", 6),
            "ZSTD": ("ZSTD_LEVEL", 9),
            "JPEG": ("JPEG_QUALITY", 85),
            "WEBP": ("WEBP_LEVEL", 75),
        }
        if codec in codec_defaults:
            option_name, option_value = codec_defaults[codec]
            kwargs.setdefault(option_name, option_value)

        # Caller-supplied options win over the derived ones
        options.update(kwargs)

        # rasterio needs a local path, so stage the output in a temporary file
        with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as staging:
            staging_path = staging.name

        try:
            with self.open_dataset() as src:
                out_profile = src.profile.copy()
                out_profile.update(**options)

                with rasterio.open(staging_path, "w", **out_profile) as dst:
                    for band_number in range(1, src.count + 1):
                        dst.write(src.read(band_number), band_number)

                    if overviews or cog:
                        if overviews is None:
                            # Auto-generate overview levels for COG
                            overviews = [2, 4, 8, 16]
                        dst.build_overviews(
                            overviews, getattr(Resampling, overview_resampling)
                        )

                    if cog:
                        dst.update_tags(LAYOUT="COG")

            # Hand the finished file over to the data store
            with open(staging_path, "rb") as staged:
                payload = staged.read()

            self.data_store.write_file(str(output_path), payload)

            self.logger.info(f"Raster saved to {output_path}")

        finally:
            # Best-effort removal of the staging file
            try:
                os.remove(staging_path)
            except OSError:
                pass

        return output_path

    def save_array_to_file(
        self,
        array: np.ndarray,
        output_path: Union[str, Path],
        compress: Optional[str] = "LZW",
        tiled: bool = True,
        blocksize: int = 512,
        crs: Optional[Any] = None,
        transform: Optional[Any] = None,
        nodata: Optional[float] = None,
        **kwargs,
    ) -> Path:
        """
        Write a numpy array as a GeoTIFF, borrowing georeferencing from this raster.

        The array is staged in a local temporary file and its bytes are then
        passed to ``self.data_store.write_file``. The temporary file is removed
        afterwards, whether or not the export succeeded.

        Args:
            array: 2D (height, width) or 3D (bands, height, width) data to save.
            output_path: Destination file path.
            compress: Compression method; 'NONE' or a falsy value disables it.
            tiled: If True, tiles the output using ``blocksize``.
            blocksize: Block width and height for tiled output.
            crs: CRS to write; taken from the source dataset when None.
            transform: Affine transform to write; taken from the source when None.
            nodata: Nodata value to write; taken from the source when None.
            **kwargs: Additional profile entries; these override the derived ones.

        Returns:
            Path to the saved TIF file.

        Raises:
            ValueError: If ``array`` is neither 2D nor 3D.
        """
        output_path = Path(output_path)

        # Promote a single band to (1, height, width)
        if array.ndim == 2:
            array = array[np.newaxis, :, :]
        elif array.ndim != 3:
            raise ValueError(f"Array must be 2D or 3D, got shape {array.shape}")

        num_bands, height, width = array.shape

        # Fill any missing georeferencing from the source dataset
        with self.open_dataset() as src:
            crs = src.crs if crs is None else crs
            transform = src.transform if transform is None else transform
            nodata = src.nodata if nodata is None else nodata

        out_profile = {
            "driver": "GTiff",
            "height": height,
            "width": width,
            "count": num_bands,
            "dtype": array.dtype,
            "crs": crs,
            "transform": transform,
            "nodata": nodata,
        }

        if compress and compress.upper() != "NONE":
            out_profile["compress"] = compress.upper()
        if tiled:
            out_profile["tiled"] = True
            out_profile["blockxsize"] = blocksize
            out_profile["blockysize"] = blocksize

        out_profile.update(kwargs)

        # rasterio needs a local path, so stage the output in a temporary file
        with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as staging:
            staging_path = staging.name

        try:
            with rasterio.open(staging_path, "w", **out_profile) as dst:
                # rasterio band indexes are 1-based
                for band_number, band_values in enumerate(array, start=1):
                    dst.write(band_values, band_number)

            # Hand the finished file over to the data store
            with open(staging_path, "rb") as staged:
                payload = staged.read()

            self.data_store.write_file(str(output_path), payload)

            self.logger.info(f"Array saved to {output_path}")

        finally:
            # Best-effort removal of the staging file
            try:
                os.remove(staging_path)
            except OSError:
                pass

        return output_path

    def __enter__(self):
        """Enter the context manager and return this instance unchanged."""
        return self

    def __del__(self):
        """Clean up temporary files and directories."""
        if (
            hasattr(self, "_temp_dir")
            and self._temp_dir
            and os.path.exists(self._temp_dir)
        ):
            shutil.rmtree(self._temp_dir, ignore_errors=True)

    def cleanup(self):
        """
        Explicit cleanup method for better control.

        Removes the temporary directory if it is set and still exists, and
        logs the removal. Safe to call repeatedly and on instances whose
        ``_temp_dir`` is missing or None.
        """
        temp_dir = getattr(self, "_temp_dir", None)
        # Guard against an unset directory the same way __del__ does:
        # os.path.exists(None) raises TypeError.
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
            self.logger.info("Cleaned up temporary files")

    def __exit__(self, *args):
        """
        Leave the context manager and remove temporary files.

        Delegates to ``cleanup()``. Nothing is returned, so an exception raised
        inside the ``with`` block is not suppressed.
        """
        self.cleanup()

bounds property

Get the bounds of the TIF file

count: int property

Get the band count from the TIF file

crs property

Get the coordinate reference system from the TIF file

dtype property

Get the data types from the TIF file

is_merged: bool property

Check if this processor was created from multiple rasters.

nodata: int property

Get the value representing no data in the rasters

resolution: Tuple[float, float] property

Get the x and y resolution (pixel width and height or pixel size) from the TIF file

source_count: int property

Get the number of source rasters.

transform property

Get the transform from the TIF file

x_transform: float property

Get the x transform from the TIF file

y_transform: float property

Get the y transform from the TIF file

__del__()

Clean up temporary files and directories.

Source code in gigaspatial/processing/tif_processor.py
def __del__(self):
    """Clean up temporary files and directories."""
    if (
        hasattr(self, "_temp_dir")
        and self._temp_dir
        and os.path.exists(self._temp_dir)
    ):
        shutil.rmtree(self._temp_dir, ignore_errors=True)

__exit__(*args)

Proper context manager exit with cleanup.

Source code in gigaspatial/processing/tif_processor.py
def __exit__(self, *args):
    """
    Leave the context manager and remove temporary files.

    Delegates to ``cleanup()``. Nothing is returned, so an exception raised
    inside the ``with`` block is not suppressed.
    """
    self.cleanup()

__post_init__()

Validate inputs, merge rasters if needed, and set up logging.

Source code in gigaspatial/processing/tif_processor.py
def __post_init__(self):
    """
    Validate inputs, merge rasters if needed, and set up logging.

    Initialises the data store (defaulting to ``LocalDataStore``), logger,
    metadata cache and a fresh temporary directory, then resolves
    ``dataset_path``:

    * a list of paths is validated and merged, and ``dataset_path`` is
      replaced by the merged file path;
    * a single path is checked for existence and, when ``target_crs`` is
      set, reprojected into a temporary file that replaces ``dataset_path``.

    Finally the raster metadata is loaded and the mode is checked against
    the band count.

    Raises:
        FileNotFoundError: If a single dataset path does not exist.
    """
    self.data_store = self.data_store or LocalDataStore()
    self.logger = config.get_logger(self.__class__.__name__)
    self._cache = {}
    # Working directory for intermediate files; removed by cleanup()/__del__
    self._temp_dir = tempfile.mkdtemp()
    # These paths are consulted by open_dataset() to pick the file to read
    self._merged_file_path = None
    self._reprojected_file_path = None
    self._clipped_file_path = None

    # Handle multiple dataset paths
    if isinstance(self.dataset_path, list):
        self.dataset_paths = [Path(p) for p in self.dataset_path]
        self._validate_multiple_datasets()
        self._merge_rasters()
        # NOTE(review): presumably _merge_rasters() sets _merged_file_path — confirm
        self.dataset_path = self._merged_file_path
    else:
        self.dataset_paths = [Path(self.dataset_path)]
        # For absolute paths with LocalDataStore, check file existence directly
        # to avoid path resolution issues
        if isinstance(self.data_store, LocalDataStore) and os.path.isabs(
            str(self.dataset_path)
        ):
            if not os.path.exists(str(self.dataset_path)):
                raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")
        elif not self.data_store.file_exists(str(self.dataset_path)):
            raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")

        # Reproject single raster during initialization if target_crs is set
        if self.target_crs:
            self.logger.info(f"Reprojecting single raster to {self.target_crs}...")
            # The whole file is read into memory so rasterio can open it
            # regardless of which data store holds it
            with self.data_store.open(str(self.dataset_path), "rb") as f:
                with rasterio.MemoryFile(f.read()) as memfile:
                    with memfile.open() as src:
                        self._reprojected_file_path = self._reproject_to_temp_file(
                            src, self.target_crs
                        )
            # From here on the processor works on the reprojected copy
            self.dataset_path = self._reprojected_file_path

    self._load_metadata()
    self._validate_mode_band_compatibility()

cleanup()

Explicit cleanup method for better control.

Source code in gigaspatial/processing/tif_processor.py
def cleanup(self):
    """
    Explicit cleanup method for better control.

    Removes the temporary directory if it is set and still exists, and
    logs the removal. Safe to call repeatedly and on instances whose
    ``_temp_dir`` is missing or None.
    """
    temp_dir = getattr(self, "_temp_dir", None)
    # Guard against an unset directory the same way __del__ does:
    # os.path.exists(None) raises TypeError.
    if temp_dir and os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
        self.logger.info("Cleaned up temporary files")

clip_to_bounds(bounds, bounds_crs=None, return_clipped_processor=True)

Clip the raster to a rectangular bounding box.

Parameters:

Name Type Description Default
bounds tuple

Bounding box as (minx, miny, maxx, maxy).

required
bounds_crs Optional[str]

The CRS of the input bounds. Defaults to raster CRS.

None
return_clipped_processor bool

If True, returns a new TifProcessor instance.

True

Returns:

Type Description
Union[TifProcessor, tuple]

The clipped TifProcessor or tuple of data/metadata.

Source code in gigaspatial/processing/tif_processor.py
def clip_to_bounds(
    self,
    bounds: tuple,
    bounds_crs: Optional[str] = None,
    return_clipped_processor: bool = True,
) -> Union["TifProcessor", tuple]:
    """
    Clip the raster to a rectangular bounding box.

    The box is turned into a geometry and passed to ``clip_to_geometry``
    with ``crop=True``.

    Args:
        bounds: Bounding box as (minx, miny, maxx, maxy).
        bounds_crs: The CRS of the input bounds. When given and not equal to
            the raster CRS, the box is reprojected to the raster CRS first.
            Defaults to raster CRS.
        return_clipped_processor: If True, returns a new TifProcessor instance.

    Returns:
        The clipped TifProcessor or tuple of data/metadata.
    """
    clip_shape = box(*bounds)

    needs_reprojection = bounds_crs is not None and not self.crs == bounds_crs
    if needs_reprojection:
        # Reproject the box through a one-row GeoDataFrame
        box_frame = gpd.GeoDataFrame([1], geometry=[clip_shape], crs=bounds_crs)
        clip_shape = box_frame.to_crs(self.crs).geometry.iloc[0]

    return self.clip_to_geometry(
        geometry=clip_shape,
        crop=True,
        return_clipped_processor=return_clipped_processor,
    )

clip_to_geometry(geometry, crop=True, all_touched=True, invert=False, nodata=None, pad=False, pad_width=0.5, return_clipped_processor=True)

Clip the raster to the boundaries of specific geometries.

Parameters:

Name Type Description Default
geometry Union[Polygon, MultiPolygon, GeoDataFrame, GeoSeries, List[dict], dict]

The geometry to clip to (Polygon, GDF, GeoSeries, etc.).

required
crop bool

If True, the raster's extent is reduced to the geometry's bounding box.

True
all_touched bool

If True, includes all pixels touched by the geometry.

True
invert bool

If True, masks pixels inside the geometry.

False
nodata Optional[Union[int, float]]

Override for the nodata value in the output.

None
pad bool

Whether to pad the geometry before clipping.

False
pad_width float

Width of the padding in pixels.

0.5
return_clipped_processor bool

If True, returns a new TifProcessor instance.

True

Returns:

Type Description
Union[TifProcessor, tuple]

A new TifProcessor instance (if return_clipped_processor is True) or

Union[TifProcessor, tuple]

a tuple of (clipped_array, transform, metadata).

Raises:

Type Description
ValueError

If the geometry does not overlap with the raster or CRS is incompatible.

Source code in gigaspatial/processing/tif_processor.py
def clip_to_geometry(
    self,
    geometry: Union[
        Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
    ],
    crop: bool = True,
    all_touched: bool = True,
    invert: bool = False,
    nodata: Optional[Union[int, float]] = None,
    pad: bool = False,
    pad_width: float = 0.5,
    return_clipped_processor: bool = True,
) -> Union["TifProcessor", tuple]:
    """
    Clip the raster to the boundaries of specific geometries.

    Args:
        geometry: The geometry to clip to (Polygon, GDF, GeoSeries, etc.).
        crop: If True, the raster's extent is reduced to the geometry's bounding box.
        all_touched: If True, includes all pixels touched by the geometry.
        invert: If True, masks pixels *inside* the geometry.
        nodata: Override for the nodata value in the output; the source
            dataset's nodata is recorded in the metadata when None.
        pad: Whether to pad the geometry before clipping.
        pad_width: Width of the padding in pixels.
        return_clipped_processor: If True, returns a new TifProcessor instance.

    Returns:
        A new TifProcessor instance (if return_clipped_processor is True) or
        a tuple of (clipped_array, transform, metadata).

    Raises:
        ValueError: If the geometry does not overlap with the raster or CRS is incompatible.
    """
    # Normalise the input into shapes, then check its CRS
    clip_shapes = self._prepare_geometry_for_clipping(geometry)
    self._validate_geometry_crs(geometry)

    with self.open_dataset() as src:
        try:
            out_array, out_transform = mask(
                dataset=src,
                shapes=clip_shapes,
                crop=crop,
                all_touched=all_touched,
                invert=invert,
                nodata=nodata,
                pad=pad,
                pad_width=pad_width,
                filled=True,
            )

            # Describe the clipped raster: new size, transform and nodata
            if nodata is not None:
                out_nodata = nodata
            else:
                out_nodata = src.nodata

            out_meta = src.meta.copy()
            out_meta.update(
                height=out_array.shape[1],
                width=out_array.shape[2],
                transform=out_transform,
                nodata=out_nodata,
            )

        except ValueError as exc:
            # Only the "no overlap" failure gets a friendlier message
            if "Input shapes do not overlap raster" not in str(exc):
                raise
            raise ValueError(
                "The geometry does not overlap with the raster. "
                "Check that both are in the same coordinate reference system."
            ) from exc

    if not return_clipped_processor:
        return out_array, out_transform, out_meta

    # Wrap the clipped data in a new TifProcessor
    return self._create_clipped_processor(out_array, out_meta)

get_raster_info(include_statistics=False, approx_ok=False)

Get comprehensive metadata and statistics for the raster.

Parameters:

Name Type Description Default
include_statistics bool

Whether to compute pixel statistics (mean, std, etc.).

False
approx_ok bool

Whether to allow approximate statistics for speed.

False

Returns:

Type Description
Dict[str, Any]

Dictionary containing metadata like dimensions, CRS, bounds, and optionally statistics.

Source code in gigaspatial/processing/tif_processor.py
def get_raster_info(
    self,
    include_statistics: bool = False,
    approx_ok: bool = False,
) -> Dict[str, Any]:
    """
    Summarize the raster's metadata, optionally with pixel statistics.

    Args:
        include_statistics: When True, a "statistics" entry produced by
            `_get_basic_statistics` is added to the result.
        approx_ok: Forwarded to `_get_basic_statistics`; only consulted when
            `include_statistics` is True.

    Returns:
        Dictionary with the keys "count", "width", "height", "crs", "bounds",
        "transform", "dtypes", "nodata", "mode", "is_merged" and
        "source_count", plus "statistics" when requested.
    """
    # (output key, attribute read from self). Only "dtypes" differs from the
    # name of the attribute it is read from.
    key_to_attribute = (
        ("count", "count"),
        ("width", "width"),
        ("height", "height"),
        ("crs", "crs"),
        ("bounds", "bounds"),
        ("transform", "transform"),
        ("dtypes", "dtype"),
        ("nodata", "nodata"),
        ("mode", "mode"),
        ("is_merged", "is_merged"),
        ("source_count", "source_count"),
    )
    summary = {key: getattr(self, attribute) for key, attribute in key_to_attribute}

    if not include_statistics:
        return summary

    summary["statistics"] = self._get_basic_statistics(approx_ok=approx_ok)
    return summary

open_dataset()

Context manager for robustly accessing the TIF dataset.

Automatically handles access to original, merged, reprojected, or clipped files across different data stores.

Yields:

Type Description

A rasterio.DatasetReader object.

Source code in gigaspatial/processing/tif_processor.py
@contextmanager
def open_dataset(self):
    """
    Context manager that opens whichever file currently backs this raster.

    A derived file takes precedence, checked in the order merged,
    reprojected, clipped; the first path that is set is opened directly
    with rasterio. Otherwise the original dataset is opened by path when
    the data store is a LocalDataStore, or read fully through the data
    store and opened from an in-memory file.

    Yields:
        A rasterio.DatasetReader object.
    """
    derived_path = (
        self._merged_file_path
        or self._reprojected_file_path
        or self._clipped_file_path
    )
    if derived_path:
        with rasterio.open(derived_path) as dataset:
            yield dataset
        return

    if isinstance(self.data_store, LocalDataStore):
        with rasterio.open(str(self.dataset_path)) as dataset:
            yield dataset
        return

    # Non-local store: pull the whole file into memory and let rasterio read
    # it from a MemoryFile. The store handle stays open for the whole yield.
    with self.data_store.open(str(self.dataset_path), "rb") as handle:
        payload = handle.read()
        with rasterio.MemoryFile(payload) as memfile, memfile.open() as dataset:
            yield dataset

reproject_to(target_crs, output_path=None, resampling_method=None, resolution=None)

Reproject the current raster to a new CRS.

Parameters:

Name Type Description Default
target_crs str

The destination CRS (e.g., "EPSG:4326").

required
output_path Optional[Union[str, Path]]

Optional path to save the result. If None, saves to temp.

None
resampling_method Optional[Resampling]

Optional override for resampling.

None
resolution Optional[Tuple[float, float]]

Optional target pixel resolution (x, y).

None

Returns:

Type Description

Path to the reprojected file.

Source code in gigaspatial/processing/tif_processor.py
def reproject_to(
    self,
    target_crs: str,
    output_path: Optional[Union[str, Path]] = None,
    resampling_method: Optional[Resampling] = None,
    resolution: Optional[Tuple[float, float]] = None,
):
    """
    Reproject the current raster to a new CRS.

    Args:
        target_crs: The destination CRS (e.g., "EPSG:4326"). It is compared
            as a string against `src.crs.to_string()` to detect a no-op.
        output_path: Optional path to save the result. If None, a uniquely
            named file is created inside the processor's temp directory.
        resampling_method: Optional override for resampling. Falls back to
            `self.resampling_method` only when None.
        resolution: Optional target pixel resolution (x, y). Falls back to
            `self.reprojection_resolution` only when None.

    Returns:
        Path to the reprojected file. When the raster is already in
        `target_crs`, `self.dataset_path` is returned instead (after copying
        it to `output_path` through the data store if one was given).
    """
    self.logger.info(f"Reprojecting raster to {target_crs}...")

    # Compare against None explicitly: `Resampling` is an IntEnum whose
    # `nearest` member equals 0, so a truthiness test (`x or default`) would
    # silently discard an explicit request for nearest-neighbour resampling.
    if resampling_method is None:
        resampling_method = self.resampling_method
    if resolution is None:
        resolution = self.reprojection_resolution

    with self.open_dataset() as src:
        if src.crs.to_string() == target_crs:
            self.logger.info(
                "Raster is already in the target CRS. No reprojection needed."
            )
            # If output_path is specified, copy the file
            if output_path:
                self.data_store.copy_file(str(self.dataset_path), output_path)
            return self.dataset_path

        dst_path = output_path or os.path.join(
            self._temp_dir, f"reprojected_single_{os.urandom(8).hex()}.tif"
        )

        with rasterio.open(
            dst_path,
            "w",
            **self._get_reprojection_profile(src, target_crs, resolution),
        ) as dst:
            # Warp band by band; the destination grid comes from the profile
            # the output file was opened with.
            for band_idx in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, band_idx),
                    destination=rasterio.band(dst, band_idx),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst.transform,
                    dst_crs=dst.crs,
                    resampling=resampling_method,
                    num_threads=multiprocessing.cpu_count(),
                )

        self.logger.info(f"Reprojection complete. Output saved to {dst_path}")
        return Path(dst_path)

sample_by_coordinates(coordinate_list, **kwargs)

Extract raster values at specific point coordinates.

Parameters:

Name Type Description Default
coordinate_list List[Tuple[float, float]]

List of (longitude, latitude) tuples.

required
**kwargs

Additional arguments passed to rasterio.sample.

{}

Returns:

Type Description
Union[ndarray, dict]

Numpy array of values (single-band) or dict of band values (RGB/RGBA).

Source code in gigaspatial/processing/tif_processor.py
def sample_by_coordinates(
    self, coordinate_list: List[Tuple[float, float]], **kwargs
) -> Union[np.ndarray, dict]:
    """
    Extract raster values at specific point coordinates.

    Args:
        coordinate_list: List of (longitude, latitude) tuples. In RGB/RGBA
            mode it is iterated once per band, so it must be re-iterable
            (a list or tuple, not a one-shot generator).
        **kwargs: Additional arguments passed to the dataset's `sample`
            method in every mode. In RGB/RGBA mode the band index is fixed
            per colour, so a caller-supplied `indexes` is ignored there.

    Returns:
        - "rgb"/"rgba" mode: dict mapping colour name ("red", "green",
          "blue" and, for RGBA, "alpha") to a list of sampled values.
        - other modes with more than one band: 2D numpy array with one row
          per coordinate.
        - single band: 1D numpy array with one value per coordinate.

    Raises:
        ValueError: If the mode is "rgb"/"rgba" but the band count is not
            3/4 respectively.
    """
    self.logger.info("Sampling raster values at the coordinates...")

    # Colour names in band order for the colour modes; None for other modes.
    color_bands = {
        "rgb": ("red", "green", "blue"),
        "rgba": ("red", "green", "blue", "alpha"),
    }.get(self.mode)

    with self.open_dataset() as src:
        if color_bands is not None:
            if self.count != len(color_bands):
                raise ValueError(
                    f"{self.mode.upper()} mode requires a "
                    f"{len(color_bands)}-band TIF file"
                )

            # Each colour is sampled from its own band, so `indexes` is set
            # here; every other caller option is forwarded unchanged.
            band_kwargs = {k: v for k, v in kwargs.items() if k != "indexes"}
            return {
                color: [
                    vals[0]
                    for vals in src.sample(
                        coordinate_list, indexes=band_idx, **band_kwargs
                    )
                ]
                for band_idx, color in enumerate(color_bands, 1)
            }

        if self.count > 1:
            return np.array(list(src.sample(coordinate_list, **kwargs)))

        # Single band: forward kwargs here too (they used to be dropped).
        return np.array(
            [vals[0] for vals in src.sample(coordinate_list, **kwargs)]
        )

sample_by_polygons(polygon_list, stat='mean')

Sample raster values within polygons and compute aggregate statistics.

Parameters:

Name Type Description Default
polygon_list

List of Shapely Polygon or MultiPolygon objects.

required
stat Union[str, Callable, List[Union[str, Callable]]]

Statistic(s) to compute. Can be a string (e.g., 'mean'), a callable, or a list of both.

'mean'

Returns:

Type Description

Numpy array of results (if single stat) or a list of dictionaries (if multi-stat).

Source code in gigaspatial/processing/tif_processor.py
def sample_by_polygons(
    self,
    polygon_list,
    stat: Union[str, Callable, List[Union[str, Callable]]] = "mean",
):
    """
    Sample raster values within polygons and compute aggregate statistics.

    Args:
        polygon_list: Iterable of Shapely Polygon or MultiPolygon objects.
        stat: Statistic(s) to compute. Can be a string naming a numpy
              function (e.g., 'mean'), the special string 'count' (number of
              valid pixels), a callable, or a list/tuple mixing these.

    Returns:
        Numpy array of results (if single stat) or a list of dictionaries
        keyed by statistic name (if multi-stat). A polygon with no valid
        pixels, or whose sampling fails, yields NaN for its statistic(s).

    Raises:
        AttributeError: If a string statistic is not 'count' and does not
            name a numpy attribute.
    """
    # A list or tuple requests several statistics per polygon; anything else
    # is treated as one statistic.
    single_stat = not isinstance(stat, (list, tuple))
    stats_list = [stat] if single_stat else list(stat)

    # Resolve every requested statistic to a (callable, name) pair up front.
    stat_funcs = []
    stat_names = []
    for s in stats_list:
        if callable(s):
            stat_funcs.append(s)
            stat_names.append(getattr(s, "__name__", f"custom_{len(stat_names)}"))
        else:
            # 'count' is not a numpy reduction; it is the number of valid pixels.
            stat_funcs.append(len if s == "count" else getattr(np, s))
            stat_names.append(s)

    def _nan_result():
        """Placeholder for a polygon that produced no usable pixels."""
        return np.nan if single_stat else {name: np.nan for name in stat_names}

    # Sized, empty input: nothing to sample, so skip opening the dataset.
    # Unsized iterables (e.g. generators) fall through to the loop below.
    if hasattr(polygon_list, "__len__") and len(polygon_list) == 0:
        return np.array([]) if single_stat else []

    results = []

    with self.open_dataset() as src:
        for polygon in tqdm(polygon_list):
            try:
                out_image, _ = mask(src, [polygon], crop=True, filled=False)

                if hasattr(out_image, "mask"):
                    # Masked array: keep only the unmasked pixels.
                    valid_data = out_image.compressed()
                elif self.nodata is not None:
                    # Explicit None check so a nodata value of 0 is honoured.
                    valid_data = out_image[out_image != self.nodata]
                else:
                    valid_data = out_image.flatten()

                if len(valid_data) == 0:
                    results.append(_nan_result())
                elif single_stat:
                    results.append(stat_funcs[0](valid_data))
                else:
                    # One failing statistic must not discard the others.
                    polygon_stats = {}
                    for func, name in zip(stat_funcs, stat_names):
                        try:
                            polygon_stats[name] = func(valid_data)
                        except Exception:
                            polygon_stats[name] = np.nan
                    results.append(polygon_stats)

            except Exception as exc:
                # Best-effort: keep one result per polygon, but leave a trace
                # instead of swallowing the failure silently.
                self.logger.debug(
                    "Sampling failed for a polygon; recording NaN: %s", exc
                )
                results.append(_nan_result())

    return np.array(results) if single_stat else results

sample_by_polygons_batched(polygon_list, stat='mean', batch_size=100, n_workers=4, show_progress=True, check_memory=True, **kwargs)

Sample raster values by polygons in parallel using batch processing.

Efficiently distributes sampling tasks across multiple worker processes.

Parameters:

Name Type Description Default
polygon_list List[Union[Polygon, MultiPolygon]]

List of Shapely Polygon or MultiPolygon objects.

required
stat Union[str, Callable]

Statistic to compute for each polygon.

'mean'
batch_size int

Number of polygons to process in each worker batch.

100
n_workers int

Number of parallel processes to use.

4
show_progress bool

If True, displays a progress bar.

True
check_memory bool

If True, validates memory availability before starting.

True
**kwargs

Additional arguments.

{}

Returns:

Type Description
ndarray

Numpy array of statistics for each polygon.

Source code in gigaspatial/processing/tif_processor.py
def sample_by_polygons_batched(
    self,
    polygon_list: List[Union[Polygon, MultiPolygon]],
    stat: Union[str, Callable] = "mean",
    batch_size: int = 100,
    n_workers: int = 4,
    show_progress: bool = True,
    check_memory: bool = True,
    **kwargs,
) -> np.ndarray:
    """
    Sample raster values by polygons in parallel using batch processing.

    The polygons are split into chunks of `batch_size` and each chunk is
    handed to a worker of a `multiprocessing.Pool`.

    Args:
        polygon_list: List of Shapely Polygon or MultiPolygon objects.
        stat: Statistic to compute for each polygon: a callable, the string
            'count' (number of values), or the name of a numpy function.
            The resolved function is sent to the workers, so a custom
            callable presumably has to be picklable.
        batch_size: Number of polygons to process in each worker batch.
        n_workers: Number of parallel processes to use.
        show_progress: If True, displays a progress bar.
        check_memory: If True, runs the memory guard first and emits a
            ResourceWarning with a suggested worker count when it fails.
        **kwargs: Additional arguments (currently unused by this method).

    Returns:
        Numpy array of statistics for each polygon (empty array for empty
        input).
    """
    import sys

    # Imported once at function scope. A conditional `import warnings` further
    # down would make `warnings` a local name for the whole function, and the
    # memory-guard branch below would then fail with UnboundLocalError.
    import warnings

    # Memory guard check with n_workers consideration
    if check_memory:
        is_safe = self._memory_guard(
            "batched_sampling",
            threshold_percent=85.0,
            n_workers=n_workers,
            raise_error=False,
        )

        if not is_safe:
            # Suggest a worker count that fits in ~70% of available memory.
            memory_info = self._check_available_memory()
            estimates = self._estimate_memory_usage("batched_sampling", n_workers=1)

            suggested_workers = max(
                1, int(memory_info["available"] * 0.7 / estimates["per_worker"])
            )

            warnings.warn(
                f"Consider reducing n_workers from {n_workers} to {suggested_workers} "
                f"to reduce memory pressure.",
                ResourceWarning,
            )

    # Platform check: warn when the start method is not "fork" on
    # Windows/macOS, where batched sampling may not work.
    if sys.platform in ("win32", "darwin"):
        if multiprocessing.get_start_method(allow_none=True) != "fork":
            warnings.warn(
                "Batched sampling may not work on Windows/macOS. "
                "Use sample_by_polygons() if you encounter errors.",
                RuntimeWarning,
            )

    if len(polygon_list) == 0:
        return np.array([])

    # Resolve the statistic; 'count' is handled the same way as in
    # sample_by_polygons() because numpy has no `count` function.
    if callable(stat):
        stat_func = stat
    elif stat == "count":
        stat_func = len
    else:
        stat_func = getattr(np, stat)

    polygon_chunks = [
        polygon_list[start : start + batch_size]
        for start in range(0, len(polygon_list), batch_size)
    ]

    with multiprocessing.Pool(
        initializer=self._initializer_worker, processes=n_workers
    ) as pool:
        process_func = partial(self._process_polygon_batch, stat_func=stat_func)
        # imap preserves chunk order, so results line up with polygon_list.
        batch_iter = pool.imap(process_func, polygon_chunks)
        if show_progress:
            batch_iter = tqdm(
                batch_iter,
                total=len(polygon_chunks),
                desc="Sampling polygons",
            )
        batched_results = list(batch_iter)

        results = [item for sublist in batched_results for item in sublist]

    return np.array(results)

save_array_to_file(array, output_path, compress='LZW', tiled=True, blocksize=512, crs=None, transform=None, nodata=None, **kwargs)

Save a numpy array to a raster file using metadata from this processor.

Parameters:

Name Type Description Default
array ndarray

2D or 3D array of data to save.

required
output_path Union[str, Path]

Destination file path.

required
compress Optional[str]

Compression method.

'LZW'
tiled bool

If True, tiles the output.

True
blocksize int

Block size for tiled output.

512
crs Optional[Any]

Optional CRS override.

None
transform Optional[Any]

Optional Affine transform override.

None
nodata Optional[float]

Optional nodata value override.

None
**kwargs

Additional creation options.

{}

Returns:

Type Description
Path

Path to the saved TIF file.

Source code in gigaspatial/processing/tif_processor.py
def save_array_to_file(
    self,
    array: np.ndarray,
    output_path: Union[str, Path],
    compress: Optional[str] = "LZW",
    tiled: bool = True,
    blocksize: int = 512,
    crs: Optional[Any] = None,
    transform: Optional[Any] = None,
    nodata: Optional[float] = None,
    **kwargs,
) -> Path:
    """
    Write a numpy array as a GeoTIFF, borrowing georeferencing from this raster.

    The file is first written to a local temporary file with rasterio, then
    its bytes are pushed through `self.data_store.write_file`; the temporary
    file is removed afterwards.

    Args:
        array: 2D (rows, cols) or 3D (bands, rows, cols) array to save.
        output_path: Destination path inside the data store.
        compress: Compression method; None or "NONE" disables it.
        tiled: If True, writes a tiled file with square blocks.
        blocksize: Block edge length used when `tiled` is True.
        crs: CRS override; defaults to the source dataset's CRS.
        transform: Affine transform override; defaults to the source's.
        nodata: Nodata override; defaults to the source's nodata value.
        **kwargs: Extra profile entries; they override the computed profile.

    Returns:
        `output_path` as a Path.

    Raises:
        ValueError: If `array` is neither 2D nor 3D.
    """
    output_path = Path(output_path)

    if array.ndim not in (2, 3):
        raise ValueError(f"Array must be 2D or 3D, got shape {array.shape}")
    if array.ndim == 2:
        # Promote a single band to (1, rows, cols).
        array = array[np.newaxis, ...]

    num_bands, height, width = array.shape

    # Fill any georeferencing the caller did not supply from the source dataset.
    with self.open_dataset() as src:
        crs = src.crs if crs is None else crs
        transform = src.transform if transform is None else transform
        nodata = src.nodata if nodata is None else nodata
        dtype = array.dtype

    profile = dict(
        driver="GTiff",
        height=height,
        width=width,
        count=num_bands,
        dtype=dtype,
        crs=crs,
        transform=transform,
        nodata=nodata,
    )

    if compress and compress.upper() != "NONE":
        profile["compress"] = compress.upper()
    if tiled:
        profile.update(tiled=True, blockxsize=blocksize, blockysize=blocksize)

    # Caller-supplied options win over everything computed above.
    profile.update(kwargs)

    # Reserve a temporary file name; rasterio writes to it below.
    with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        with rasterio.open(tmp_path, "w", **profile) as dst:
            for band_number, band in enumerate(array, start=1):
                dst.write(band, band_number)

        # Hand the finished file to the data store as raw bytes.
        with open(tmp_path, "rb") as f:
            file_content = f.read()

        self.data_store.write_file(str(output_path), file_content)

        self.logger.info(f"Array saved to {output_path}")

    finally:
        # Best-effort removal of the temporary file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return output_path

save_to_file(output_path, compress='LZW', tiled=True, blocksize=512, bigtiff=None, predictor=None, num_threads=None, cog=False, overviews=None, overview_resampling='nearest', **kwargs)

Export the raster to a file with optimized settings.

Parameters:

Name Type Description Default
output_path Union[str, Path]

Output file path.

required
compress Optional[str]

Compression method (e.g., 'LZW', 'ZSTD').

'LZW'
tiled bool

If True, tiles the output for better performance.

True
blocksize int

Block size for tiled output.

512
bigtiff Optional[str]

'YES', 'NO', or 'IF_NEEDED' for large files.

None
predictor Optional[int]

Compression predictor (2 for int, 3 for float).

None
num_threads Optional[int]

Number of threads for compression.

None
cog bool

If True, creates a Cloud-Optimized GeoTIFF.

False
overviews Optional[List[int]]

Overview levels for COG.

None
overview_resampling str

Resampling method for overviews.

'nearest'
**kwargs

Additional creation options for rasterio.

{}

Returns:

Type Description
Path

Path to the saved TIF file.

Source code in gigaspatial/processing/tif_processor.py
def save_to_file(
    self,
    output_path: Union[str, Path],
    compress: Optional[str] = "LZW",
    tiled: bool = True,
    blocksize: int = 512,
    bigtiff: Optional[str] = None,
    predictor: Optional[int] = None,
    num_threads: Optional[int] = None,
    cog: bool = False,
    overviews: Optional[List[int]] = None,
    overview_resampling: str = "nearest",
    **kwargs,
) -> Path:
    """
    Export the raster to a file through the data store.

    The dataset returned by `open_dataset` is copied band by band into a
    local temporary file using the source profile updated with the creation
    options below; the bytes are then written via
    `self.data_store.write_file` and the temporary file is removed.

    Args:
        output_path: Output file path.
        compress: Compression method (e.g., 'LZW', 'ZSTD'); None or "NONE"
            disables it. DEFLATE, ZSTD, JPEG and WEBP get a default
            level/quality unless the matching option is in `kwargs`.
        tiled: If True, writes a tiled file with square blocks.
        blocksize: Block edge length used when `tiled` is True.
        bigtiff: Value for the BIGTIFF option ('YES', 'NO', 'IF_NEEDED').
        predictor: Compression predictor (2 for int, 3 for float).
        num_threads: Value for the NUM_THREADS option.
        cog: If True, builds overviews and tags the file with LAYOUT="COG".
        overviews: Overview levels; defaults to [2, 4, 8, 16] when `cog` is
            True and no levels are given.
        overview_resampling: Name of the `Resampling` member for overviews.
        **kwargs: Additional creation options; they override the ones above.

    Returns:
        `output_path` as a Path.
    """
    output_path = Path(output_path)
    codec = compress.upper() if compress else None

    creation_options = {}
    if codec and codec != "NONE":
        creation_options["compress"] = codec
    if tiled:
        creation_options.update(
            tiled=True, blockxsize=blocksize, blockysize=blocksize
        )
    if bigtiff:
        creation_options["BIGTIFF"] = bigtiff
    if predictor is not None:
        creation_options["predictor"] = predictor
    if num_threads is not None:
        creation_options["NUM_THREADS"] = num_threads

    # Default level/quality per codec, applied only if the caller did not
    # already pass that option.
    codec_defaults = {
        "DEFLATE": ("ZLEVEL", 6),
        "ZSTD": ("ZSTD_LEVEL", 9),
        "JPEG": ("JPEG_QUALITY", 85),
        "WEBP": ("WEBP_LEVEL", 75),
    }
    if codec in codec_defaults:
        option_name, option_value = codec_defaults[codec]
        kwargs.setdefault(option_name, option_value)

    creation_options.update(kwargs)

    # rasterio writes to a local file, so stage the output in a temp file.
    with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        with self.open_dataset() as src:
            profile = src.profile.copy()
            profile.update(**creation_options)

            with rasterio.open(tmp_path, "w", **profile) as dst:
                for band_idx in range(1, src.count + 1):
                    dst.write(src.read(band_idx), band_idx)

                if overviews or cog:
                    if overviews is None:
                        # COG requested without explicit levels.
                        overviews = [2, 4, 8, 16]
                    resampling = getattr(Resampling, overview_resampling)
                    dst.build_overviews(overviews, resampling)

                if cog:
                    dst.update_tags(LAYOUT="COG")

        # Push the staged file through the data store as raw bytes.
        with open(tmp_path, "rb") as f:
            file_content = f.read()

        self.data_store.write_file(str(output_path), file_content)

        self.logger.info(f"Raster saved to {output_path}")

    finally:
        # Best-effort removal of the temporary file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return output_path

to_dataframe(drop_nodata=True, check_memory=True, min_value=None, max_value=None, **kwargs)

Convert the raster data into a pandas DataFrame.

Parameters:

Name Type Description Default
drop_nodata

If True, pixels with the nodata value are excluded.

True
check_memory

If True, checks system memory availability before loading.

True
min_value Optional[float]

Optional minimum threshold to filter pixels.

None
max_value Optional[float]

Optional maximum threshold to filter pixels.

None
**kwargs

Additional arguments like band_number or band_names.

{}

Returns:

Type Description
DataFrame

A DataFrame with 'lon', 'lat', and band values.

Raises:

Type Description
ValueError

If processing fails due to mode mismatch or invalid data.

Source code in gigaspatial/processing/tif_processor.py
def to_dataframe(
    self,
    drop_nodata=True,
    check_memory=True,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Convert the raster data into a pandas DataFrame.

    Args:
        drop_nodata: If True, pixels with the nodata value are excluded.
        check_memory: If True, runs the memory guard before loading.
        min_value: Optional minimum threshold to filter pixels.
        max_value: Optional maximum threshold to filter pixels.
        **kwargs: Additional arguments. `band_number` (default 1) selects
            the band in "single" mode and is ignored in every other mode,
            where all bands are read; `band_names` is forwarded as given.

    Returns:
        A DataFrame with 'lon', 'lat', and band values.

    Raises:
        ValueError: If the conversion fails for any reason; the original
            exception is attached as `__cause__`.
    """
    # Memory guard check
    if check_memory:
        self._memory_guard("conversion", threshold_percent=80.0)

    try:
        # "single" mode reads one selected band; other modes read all bands,
        # which is signalled by band_number=None.
        band_number = kwargs.get("band_number", 1) if self.mode == "single" else None
        return self._to_dataframe(
            band_number=band_number,
            drop_nodata=drop_nodata,
            band_names=kwargs.get("band_names", None),
            min_value=min_value,
            max_value=max_value,
        )
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the ValueError.
        raise ValueError(
            f"Failed to process TIF file in mode '{self.mode}'. "
            f"Please ensure the file is valid and matches the selected mode. "
            f"Original error: {str(e)}"
        ) from e

to_dataframe_chunked(drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs)

Convert raster to DataFrame using memory-efficient chunked processing.

Parameters:

Name Type Description Default
drop_nodata

Whether to exclude pixels with the nodata value.

True
chunk_size

Specific number of rows per chunk. If None, it is auto-calculated.

None
target_memory_mb

Target memory limit per chunk in megabytes.

500
**kwargs

Additional arguments like band_number or band_names.

{}

Returns:

Type Description

A consolidated DataFrame containing all processed chunks.

Source code in gigaspatial/processing/tif_processor.py
def to_dataframe_chunked(
    self, drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs
):
    """
    Convert the raster to a DataFrame by processing it window by window.

    Args:
        drop_nodata: Whether to exclude pixels with the nodata value.
        chunk_size: Chunk size handed to `_get_chunk_windows`. If None, it is
            derived from `target_memory_mb` via
            `_calculate_optimal_chunk_size`.
        target_memory_mb: Target memory limit per chunk in megabytes; only
            used when `chunk_size` is None.
        **kwargs: Additional arguments. `band_number` (default 1) is honoured
            in "single" mode only; `band_names` is forwarded as given.

    Returns:
        Whatever `_to_dataframe_chunked` returns for the computed windows
        (a consolidated DataFrame of all processed chunks).
    """
    rows_per_chunk = chunk_size
    if rows_per_chunk is None:
        rows_per_chunk = self._calculate_optimal_chunk_size(
            "conversion", target_memory_mb
        )

    windows = self._get_chunk_windows(rows_per_chunk)

    # "single" mode converts one selected band; rgb, rgba and multi convert
    # every band, which is signalled by band_number=None.
    if self.mode == "single":
        selected_band = kwargs.get("band_number", 1)
    else:
        selected_band = None

    return self._to_dataframe_chunked(
        windows,
        band_number=selected_band,
        drop_nodata=drop_nodata,
        band_names=kwargs.get("band_names", None),
    )

to_geodataframe(check_memory=True, min_value=None, max_value=None, **kwargs)

Convert the raster data into a GeoDataFrame.

Each row represents a pixel, with a Point or Box geometry representing its spatial extent.

Parameters:

Name Type Description Default
check_memory

If True, checks system memory availability.

True
min_value Optional[float]

Optional minimum threshold for pixel values.

None
max_value Optional[float]

Optional maximum threshold for pixel values.

None
**kwargs

Additional arguments passed to to_dataframe.

{}

Returns:

Type Description
GeoDataFrame

A GeoDataFrame containing pixel centroids or boxes and their values.

Source code in gigaspatial/processing/tif_processor.py
def to_geodataframe(
    self,
    check_memory=True,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Convert the raster data into a GeoDataFrame of per-pixel boxes.

    The pixels are first converted (and filtered) with `to_dataframe`; each
    remaining row then gets a rectangle centred on its ('lon', 'lat') pair
    whose width and height are the raster's x and y resolution.

    Args:
        check_memory: If True, runs the memory guard before converting.
        min_value: Optional minimum threshold for pixel values.
        max_value: Optional maximum threshold for pixel values.
        **kwargs: Additional arguments passed to `to_dataframe`.

    Returns:
        A GeoDataFrame with the DataFrame's columns, one box geometry per
        pixel, and the raster's CRS.
    """
    if check_memory:
        self._memory_guard("conversion", threshold_percent=80.0)

    # The guard already ran above, so it is skipped in the nested call.
    # Filtering happens here, before any geometry is built.
    pixel_frame = self.to_dataframe(
        check_memory=False, min_value=min_value, max_value=max_value, **kwargs
    )

    x_res, y_res = self.resolution
    half_width = x_res / 2
    half_height = y_res / 2

    # One rectangle per pixel, centred on the pixel coordinate.
    pixel_boxes = []
    for lon, lat in zip(pixel_frame["lon"], pixel_frame["lat"]):
        pixel_boxes.append(
            box(
                lon - half_width,
                lat - half_height,
                lon + half_width,
                lat + half_height,
            )
        )

    return gpd.GeoDataFrame(pixel_frame, geometry=pixel_boxes, crs=self.crs)

to_graph(connectivity=4, band=None, include_coordinates=False, graph_type='networkx', check_memory=True)

Convert the raster into a graph representation based on pixel adjacency.

Parameters:

Name Type Description Default
connectivity Literal[4, 8]

Neighborhood connectivity (4 for von Neumann, 8 for Moore).

4
band Optional[int]

Band number to use for node values (1-indexed).

None
include_coordinates bool

If True, adds 'x' and 'y' attributes to nodes.

False
graph_type Literal['networkx', 'sparse']

Output type ('networkx' for Graph object, 'sparse' for CSR matrix).

'networkx'
check_memory bool

If True, validates memory availability before processing.

True

Returns:

Type Description
Union[Graph, csr_matrix]

A NetworkX Graph or a SciPy sparse CSR matrix.

Source code in gigaspatial/processing/tif_processor.py
def to_graph(
    self,
    connectivity: Literal[4, 8] = 4,
    band: Optional[int] = None,
    include_coordinates: bool = False,
    graph_type: Literal["networkx", "sparse"] = "networkx",
    check_memory: bool = True,
) -> Union[nx.Graph, sp.csr_matrix]:
    """
    Convert the raster into a graph representation based on pixel adjacency.

    Every valid (non-nodata) pixel becomes a node with a sequential integer ID
    assigned in row-major order. Adjacent valid pixels are connected by a
    single undirected edge whose weight is the pixel value of the
    higher-numbered endpoint.

    Args:
        connectivity: Neighborhood connectivity (4 for von Neumann, 8 for Moore).
            Any value other than 4 is treated as 8.
        band: Band number to use for node values (1-indexed). Defaults to the
            first band when None.
        include_coordinates: If True, adds 'x' and 'y' attributes to nodes.
        graph_type: Output type ('networkx' for Graph object, 'sparse' for CSR matrix).
            Any value other than 'networkx' produces the sparse matrix.
        check_memory: If True, validates memory availability before processing.

    Returns:
        A NetworkX Graph or a symmetric SciPy sparse CSR matrix of shape
        (num_valid_pixels, num_valid_pixels).

    Raises:
        ValueError: If the requested band does not exist in the raster.
    """

    # Memory guard check
    if check_memory:
        self._memory_guard("graph", threshold_percent=80.0)

    with self.open_dataset() as src:
        band_idx = band - 1 if band is not None else 0
        if band_idx < 0 or band_idx >= src.count:
            raise ValueError(
                f"Band {band} not available. Raster has {src.count} bands"
            )

        data = src.read(band_idx + 1)
        nodata = src.nodata if src.nodata is not None else self.nodata

        if nodata is None:
            valid_mask = np.ones_like(data, dtype=bool)
        elif isinstance(nodata, (float, np.floating)) and np.isnan(nodata):
            # NaN never compares equal to anything, so `data != nodata` would
            # mark every pixel as valid; test for NaN explicitly instead.
            valid_mask = ~np.isnan(data)
        else:
            valid_mask = data != nodata

        height, width = data.shape

        # Find all valid pixels
        valid_rows, valid_cols = np.where(valid_mask)
        num_valid_pixels = len(valid_rows)

        # Create a sequential mapping from (row, col) to a node ID
        node_map = np.full(data.shape, -1, dtype=int)
        node_map[valid_rows, valid_cols] = np.arange(num_valid_pixels)

        # Define neighborhood offsets
        if connectivity == 4:
            # von Neumann neighborhood (4-connectivity)
            offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        else:  # connectivity == 8
            # Moore neighborhood (8-connectivity)
            offsets = [
                (-1, -1),
                (-1, 0),
                (-1, 1),
                (0, -1),
                (0, 1),
                (1, -1),
                (1, 0),
                (1, 1),
            ]

        # Collect nodes and edges
        nodes_to_add = []
        edges_to_add = []

        for row, col in zip(valid_rows, valid_cols):
            current_node_id = node_map[row, col]

            # Prepare node attributes
            node_attrs = {"value": float(data[row, col])}
            if include_coordinates:
                x, y = src.xy(row, col)
                node_attrs["x"] = x
                node_attrs["y"] = y
            nodes_to_add.append((current_node_id, node_attrs))

            # Find neighbors and collect edges
            for dy, dx in offsets:
                neighbor_row, neighbor_col = row + dy, col + dx

                # Check if neighbor is within bounds and is a valid pixel
                if (
                    0 <= neighbor_row < height
                    and 0 <= neighbor_col < width
                    and valid_mask[neighbor_row, neighbor_col]
                ):
                    neighbor_node_id = node_map[neighbor_row, neighbor_col]

                    # Ensure each edge is added only once
                    if current_node_id < neighbor_node_id:
                        neighbor_value = float(data[neighbor_row, neighbor_col])
                        edges_to_add.append(
                            (current_node_id, neighbor_node_id, neighbor_value)
                        )

        if graph_type == "networkx":
            G = nx.Graph()
            G.add_nodes_from(nodes_to_add)
            G.add_weighted_edges_from(edges_to_add)
            return G

        # Sparse matrix: keep indices and weights in separate, correctly typed
        # arrays. This also covers the "no edges" case (isolated pixels or an
        # empty raster), where a single 2-D edge array cannot be sliced.
        if edges_to_add:
            sources, targets, edge_weights = zip(*edges_to_add)
        else:
            sources, targets, edge_weights = (), (), ()

        row_indices = np.asarray(sources, dtype=np.int64)
        col_indices = np.asarray(targets, dtype=np.int64)
        weights = np.asarray(edge_weights, dtype=float)

        # Add reverse edges for symmetric matrix
        from_idx = np.concatenate([row_indices, col_indices])
        to_idx = np.concatenate([col_indices, row_indices])
        weights = np.concatenate([weights, weights])

        return sp.coo_matrix(
            (weights, (from_idx, to_idx)),
            shape=(num_valid_pixels, num_valid_pixels),
        ).tocsr()

validate_dataset_path(value)

Validates that at least one dataset path is provided.

Source code in gigaspatial/processing/tif_processor.py
@field_validator("dataset_path")
def validate_dataset_path(cls, value):
    """
    Normalise the dataset path input and reject an empty list.

    A single-element list is unwrapped to its only element, a longer list is
    returned unchanged, and a ``Path`` or ``str`` is passed through. Any other
    type falls through and yields ``None``.

    Raises:
        ValueError: If an empty list is supplied.
    """
    if not isinstance(value, list):
        # Scalars: only Path/str are passed through.
        return value if isinstance(value, (Path, str)) else None

    path_count = len(value)
    if path_count == 0:
        raise ValueError("No dataset paths provided.")

    return value[0] if path_count == 1 else value

add_area_in_meters(gdf, area_column_name='area_in_meters')

Calculate the area of geometries in square meters and add it as a new column.

Automatically handles UTM transformation for accurate area calculation.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing (Multi)Polygon geometries.

required
area_column_name str

Name of the new column.

'area_in_meters'

Returns:

Type Description
GeoDataFrame

The input GeoDataFrame with an additional area column.

Raises:

Type Description
ValueError

If the input GeoDataFrame contains non-polygon geometries.

Source code in gigaspatial/processing/geo.py
def add_area_in_meters(
    gdf: gpd.GeoDataFrame, area_column_name: str = "area_in_meters"
) -> gpd.GeoDataFrame:
    """
    Add a column holding each geometry's area in square meters.

    The geometries are projected to an estimated UTM CRS before the area is
    measured. If the UTM estimate fails, Web Mercator (EPSG:3857) is used
    instead and a warning is logged.

    Args:
        gdf: GeoDataFrame containing (Multi)Polygon geometries.
        area_column_name: Name of the new column.

    Returns:
        A copy of the input GeoDataFrame with the additional area column;
        the input itself is left untouched.

    Raises:
        ValueError: If the input GeoDataFrame contains non-polygon geometries.
    """
    # Only polygonal geometries have a meaningful area.
    polygon_types = ["Polygon", "MultiPolygon"]
    is_polygonal = gdf.geometry.geom_type.isin(polygon_types)
    if not all(is_polygonal):
        raise ValueError(
            "Input GeoDataFrame must contain only Polygon or MultiPolygon geometries."
        )

    # Work on a copy so the caller's frame keeps its original columns.
    result = gdf.copy()

    # Pick a projected CRS for the measurement.
    try:
        metric_crs = result.estimate_utm_crs()
    except Exception as e:
        LOGGER.warning(
            f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
        )
        metric_crs = "EPSG:3857"  # Fallback to Web Mercator

    # Measure in the projected CRS; geometries in `result` keep their CRS.
    projected = result.to_crs(metric_crs)
    result[area_column_name] = projected.geometry.area

    return result

add_spatial_jitter(df, columns=['latitude', 'longitude'], amount=0.0001, seed=None, copy=True)

Add random jitter to duplicated coordinates to separate overlapping points.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing coordinates.

required
columns List[str]

Column names to jitter.

['latitude', 'longitude']
amount float

Amount of jitter to add.

0.0001
seed

Random seed for reproducibility.

None
copy

Whether to create a copy of the input DataFrame.

True

Returns:

Type Description
DataFrame

DataFrame with jittered coordinates.

Raises:

Type Description
ValueError

If columns don't exist or amount is invalid.

TypeError

If input types are incorrect.

Source code in gigaspatial/processing/geo.py
def add_spatial_jitter(
    df: pd.DataFrame,
    columns: List[str] = ["latitude", "longitude"],
    amount: Union[float, Dict[str, float]] = 0.0001,
    seed=None,
    copy=True,
) -> pd.DataFrame:
    """
    Add random jitter to duplicated coordinates to separate overlapping points.

    Only rows whose values in ``columns`` are duplicated receive jitter. If
    duplicates remain afterwards, the function calls itself again with the
    jitter amounts doubled. Duplicate rows in which every coordinate is
    NaN/infinite cannot be separated by adding jitter and are left unchanged.

    Args:
        df: DataFrame containing coordinates.
        columns: Column names to jitter. The default list is never mutated.
        amount: Amount of jitter to add, either a single positive number used
            for every column or a dict mapping each column to its own
            positive amount.
        seed: Random seed for reproducibility. When given, a private
            generator is used so the global NumPy random state is untouched;
            when None, the global NumPy random state is used.
        copy: Whether to create a copy of the input DataFrame.

    Returns:
        DataFrame with jittered coordinates.

    Raises:
        ValueError: If columns don't exist or amount is invalid.
        TypeError: If input types are incorrect.
        RuntimeError: If the jittering operation itself fails.
    """

    # Input validation
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    if not all(col in df.columns for col in columns):
        raise ValueError(f"Not all columns {columns} found in DataFrame")

    # Handle jitter amounts
    if isinstance(amount, (int, float)):
        if amount <= 0:
            raise ValueError("Jitter amount must be positive")
        jitter_amounts = {col: amount for col in columns}
    elif isinstance(amount, dict):
        if not all(col in amount for col in columns):
            raise ValueError("Must specify jitter amount for each column")
        if not all(amt > 0 for amt in amount.values()):
            raise ValueError("All jitter amounts must be positive")
        jitter_amounts = amount
    else:
        raise TypeError("amount must be a number or dictionary")

    # Create copy if requested
    df_work = df.copy() if copy else df

    # A private RandomState seeded with `seed` yields the same stream as
    # np.random.seed(seed) followed by np.random.uniform(...), but does not
    # clobber the caller's global random state.
    rng = np.random.RandomState(seed) if seed is not None else np.random

    try:
        # Find duplicated coordinates
        duplicate_mask = df_work.duplicated(subset=columns, keep=False).to_numpy()

        if duplicate_mask.any():
            # Rows with no finite coordinate at all stay identical no matter
            # how much jitter is added (NaN + x is NaN), so exclude them
            # instead of retrying forever.
            coords = df_work.loc[duplicate_mask, columns].to_numpy(
                dtype=float, na_value=np.nan
            )
            separable = np.isfinite(coords).any(axis=1)
            jitter_mask = duplicate_mask.copy()
            jitter_mask[duplicate_mask] = separable
            n_jitter = int(jitter_mask.sum())

            if n_jitter > 0:
                # Add jitter to each column separately
                for col in columns:
                    jitter = rng.uniform(
                        low=-jitter_amounts[col],
                        high=jitter_amounts[col],
                        size=n_jitter,
                    )
                    df_work.loc[jitter_mask, col] += jitter

                # Validate results, ignoring the rows that can never be separated
                unseparable = duplicate_mask & ~jitter_mask
                remaining = df_work.duplicated(subset=columns, keep=False).to_numpy()
                if (remaining & ~unseparable).any():
                    # If duplicates remain, recursively add more jitter
                    df_work = add_spatial_jitter(
                        df_work,
                        columns=columns,
                        amount={col: amt * 2 for col, amt in jitter_amounts.items()},
                        seed=seed,
                        copy=False,
                    )

        return df_work

    except Exception as e:
        raise RuntimeError(f"Error during jittering operation: {str(e)}") from e

aggregate_points_to_zones(points, zones, value_columns=None, aggregation='count', point_zone_predicate='within', zone_id_column='zone_id', output_suffix='', drop_geometry=False)

Aggregate point data to zones with flexible aggregation methods.

For zones with no overlapping points, "count" aggregation fills missing values with 0, while all other aggregations fill missing values with np.nan.

Parameters:

Name Type Description Default
points Union[DataFrame, GeoDataFrame]

Point data to aggregate.

required
zones GeoDataFrame

Zones to aggregate points to.

required
value_columns Optional[Union[str, List[str]]]

Column(s) containing values to aggregate.

None
aggregation Union[str, Dict[str, str]]

Aggregation method(s) to use.

'count'
point_zone_predicate str

Spatial predicate (e.g., 'within', 'intersects').

'within'
zone_id_column str

Column in zones containing zone identifiers.

'zone_id'
output_suffix str

Suffix to add to output column names.

''
drop_geometry bool

Whether to drop the geometry column from output.

False

Returns:

Type Description
GeoDataFrame

Zones with aggregated point values.

Raises:

Type Description
TypeError

If zones is not a GeoDataFrame or aggregation is invalid.

ValueError

If columns are missing or metadata is inconsistent.

Source code in gigaspatial/processing/geo.py
def aggregate_points_to_zones(
    points: Union[pd.DataFrame, gpd.GeoDataFrame],
    zones: gpd.GeoDataFrame,
    value_columns: Optional[Union[str, List[str]]] = None,
    aggregation: Union[str, Dict[str, str]] = "count",
    point_zone_predicate: str = "within",
    zone_id_column: str = "zone_id",
    output_suffix: str = "",
    drop_geometry: bool = False,
) -> gpd.GeoDataFrame:
    """
    Aggregate point data to zones with flexible aggregation methods.

    For zones with no overlapping points:
    - ``"count"`` aggregation fills missing values with ``0``.
    - All other aggregations fill missing values with ``np.nan``.

    Args:
        points: Point data to aggregate. Non-GeoDataFrame input is converted
            with ``convert_to_geodataframe``.
        zones: Zones to aggregate points to.
        value_columns: Column(s) containing values to aggregate.
        aggregation: Aggregation method(s) to use. Either a single method name
            applied to every value column, or a dict mapping each value column
            to its method. The string ``"count"`` counts points per zone and
            produces a ``point_count`` column.
        point_zone_predicate: Spatial predicate (e.g., 'within', 'intersects').
        zone_id_column: Column in zones containing zone identifiers.
        output_suffix: Suffix to add to output column names.
        drop_geometry: Whether to drop the geometry column from output.

    Returns:
        Zones with aggregated point values.

    Raises:
        TypeError: If zones is not a GeoDataFrame or aggregation is invalid.
        ValueError: If columns are missing, the points data already contains
            the zone ID column, or the aggregation spec is inconsistent with
            ``value_columns``.
    """
    # --- Input validation ---
    if not isinstance(zones, gpd.GeoDataFrame):
        raise TypeError("zones must be a GeoDataFrame")

    if zone_id_column not in zones.columns:
        raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

    # --- Normalise points ---
    points_gdf = (
        convert_to_geodataframe(points)
        if not isinstance(points, gpd.GeoDataFrame)
        else points.copy()
    )

    # --- CRS alignment ---
    if points_gdf.crs != zones.crs:
        points_gdf = points_gdf.to_crs(zones.crs)

    # --- Normalise value_columns ---
    if isinstance(value_columns, str):
        value_columns = [value_columns]

    if value_columns is not None:
        missing_cols = [col for col in value_columns if col not in points_gdf.columns]
        if missing_cols:
            raise ValueError(f"Value columns not found in points data: {missing_cols}")

    # A zone-ID column on the points side would be suffixed by the spatial
    # join, so the later group-by on `zone_id_column` could not find it.
    # Fail early with a clear message instead.
    if zone_id_column in points_gdf.columns:
        raise ValueError(
            f"Column name conflict: points data contains column '{zone_id_column}' "
            "which conflicts with the zone identifier column. "
            "Please rename this column in the points data."
        )

    # --- Build agg_funcs and per-output-column method lookup ---
    agg_funcs: Dict[str, str] = {}
    # Maps output column name → aggregation method for fill-value decisions
    agg_method_for_output_col: Dict[str, str] = {}

    if isinstance(aggregation, str):
        if aggregation == "count":
            agg_funcs["__count"] = "count"
        elif value_columns is not None:
            agg_funcs = {col: aggregation for col in value_columns}
            agg_method_for_output_col = {
                f"{col}{output_suffix}": aggregation for col in value_columns
            }
        else:
            raise ValueError(
                "value_columns must be specified for aggregation methods other than 'count'"
            )
    elif isinstance(aggregation, dict):
        if value_columns is None:
            raise ValueError(
                "value_columns must be specified when using a dict of aggregation methods"
            )
        missing_aggs = [col for col in value_columns if col not in aggregation]
        extra_aggs = [col for col in aggregation if col not in value_columns]
        if missing_aggs:
            raise ValueError(f"Missing aggregation methods for columns: {missing_aggs}")
        if extra_aggs:
            raise ValueError(
                f"Aggregation methods specified for columns not in value_columns: {extra_aggs}"
            )
        agg_funcs = dict(aggregation)
        agg_method_for_output_col = {
            f"{col}{output_suffix}": method for col, method in aggregation.items()
        }
    else:
        raise TypeError("aggregation must be a str or dict")

    # --- Spatial join ---
    result = zones.copy()
    # Join against the zone ID and geometry only: any other zone attribute
    # sharing a name with a points column would be suffixed by the join and
    # break the aggregation of that value column.
    join_zones = zones[[zone_id_column, zones.geometry.name]]
    joined = gpd.sjoin(
        points_gdf, join_zones, how="inner", predicate=point_zone_predicate
    )

    # --- Aggregation ---
    if "__count" in agg_funcs:
        counts = (
            joined.groupby(zone_id_column)
            .size()
            .reset_index(name=f"point_count{output_suffix}")
        )
        result = result.merge(counts, on=zone_id_column, how="left")
        result[f"point_count{output_suffix}"] = (
            result[f"point_count{output_suffix}"].fillna(0).astype(int)
        )
    else:
        # Drop geometry before non-count aggregations to avoid errors
        if "geometry" in joined.columns:
            joined = joined.drop(columns=["geometry"])

        aggregated = joined.groupby(zone_id_column).agg(agg_funcs).reset_index()

        # Flatten MultiIndex columns produced by some pandas agg paths
        if isinstance(aggregated.columns, pd.MultiIndex):
            aggregated.columns = [
                (
                    f"{col[0]}_{col[1]}{output_suffix}"
                    if col[0] != zone_id_column
                    else zone_id_column
                )
                for col in aggregated.columns
            ]
        else:
            # Single-level: rename value columns to include suffix
            aggregated = aggregated.rename(
                columns={
                    col: f"{col}{output_suffix}"
                    for col in aggregated.columns
                    if col != zone_id_column
                }
            )

        result = result.merge(aggregated, on=zone_id_column, how="left")

        # -------------------------------------------------------
        # Fill with 0 only for 'count', NaN for everything
        # else so zones with no overlapping points are distinguishable
        # from zones whose true aggregated value is zero.
        # -------------------------------------------------------
        for col in result.columns:
            if col in (zone_id_column, "geometry"):
                continue
            if not pd.api.types.is_numeric_dtype(result[col]):
                continue
            method = agg_method_for_output_col.get(col, "")
            fill_value = 0 if method == "count" else np.nan
            result[col] = result[col].fillna(fill_value)

    if drop_geometry:
        result = result.drop(columns=["geometry"])

    return result

aggregate_polygons_to_zones(polygons, zones, value_columns, aggregation='sum', predicate='intersects', zone_id_column='zone_id', output_suffix='', drop_geometry=False)

Aggregates polygon data to zones based on a spatial relationship.

Parameters:

Name Type Description Default
polygons Union[DataFrame, GeoDataFrame]

Polygon data to aggregate.

required
zones GeoDataFrame

Target zones.

required
value_columns Union[str, List[str]]

Column(s) in polygons with numeric values.

required
aggregation Union[str, Dict[str, str]]

Aggregation method(s) to use (str or dict).

'sum'
predicate Literal['intersects', 'within', 'fractional']

Spatial relationship ('intersects', 'within', 'fractional').

'intersects'
zone_id_column str

Unique identifier column in zones.

'zone_id'
output_suffix str

Suffix for output columns.

''
drop_geometry bool

Whether to drop the geometry column.

False

Returns:

Type Description
GeoDataFrame

The zones GeoDataFrame with aggregated values.

Raises:

Type Description
TypeError

If polygons/zones are not GeoDataFrames.

ValueError

If columns or predicates are invalid.

Source code in gigaspatial/processing/geo.py
def aggregate_polygons_to_zones(
    polygons: Union[pd.DataFrame, gpd.GeoDataFrame],
    zones: gpd.GeoDataFrame,
    value_columns: Union[str, List[str]],
    aggregation: Union[str, Dict[str, str]] = "sum",
    predicate: Literal["intersects", "within", "fractional"] = "intersects",
    zone_id_column: str = "zone_id",
    output_suffix: str = "",
    drop_geometry: bool = False,
) -> gpd.GeoDataFrame:
    """
    Aggregate polygon attributes onto zones using a spatial relationship.

    The polygons are validated, aligned to the zones' CRS, aggregated by the
    helper matching ``predicate``, and the result is left-merged onto
    ``zones``. Zones without any aggregated value get ``0`` for ``"count"``
    columns and ``np.nan`` for all other methods.

    Args:
        polygons: Polygon data to aggregate; non-GeoDataFrame input is
            converted with ``convert_to_geodataframe``.
        zones: Target zones.
        value_columns: Column(s) in polygons with numeric values.
        aggregation: Aggregation method(s) to use (str or dict).
        predicate: Spatial relationship ('intersects', 'within', 'fractional').
        zone_id_column: Unique identifier column in zones.
        output_suffix: Suffix for output columns.
        drop_geometry: Whether to drop the geometry column.

    Returns:
        The zones GeoDataFrame with aggregated values. When ``polygons`` is
        empty, ``zones`` itself is returned unchanged.

    Raises:
        TypeError: If zones is not a GeoDataFrame, or polygons cannot be
            converted to one.
        ValueError: If zones is empty, columns are missing or clash, the
            predicate is unsupported, or non-polygon geometries are present.
    """
    # --- Validate zones and predicate ---
    if not isinstance(zones, gpd.GeoDataFrame):
        raise TypeError("zones must be a GeoDataFrame")

    if zones.empty:
        raise ValueError("zones GeoDataFrame is empty")

    if zone_id_column not in zones.columns:
        raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

    supported_predicates = ("intersects", "within", "fractional")
    if predicate not in supported_predicates:
        raise ValueError(
            f"Unsupported predicate: '{predicate}'. "
            "Must be one of: 'intersects', 'within', 'fractional'."
        )

    # --- Obtain a private polygons GeoDataFrame ---
    if isinstance(polygons, gpd.GeoDataFrame):
        polygons_gdf = polygons.copy()
    else:
        try:
            polygons_gdf = convert_to_geodataframe(polygons)
        except Exception as e:
            raise TypeError(
                f"polygons must be a GeoDataFrame or convertible to one: {e}"
            )

    # Nothing to aggregate: hand the zones back untouched.
    if polygons_gdf.empty:
        LOGGER.warning("Empty polygons GeoDataFrame provided")
        return zones

    # --- Only (Multi)Polygon geometries are accepted ---
    polygon_types = ["Polygon", "MultiPolygon"]
    non_polygon_geoms = []
    for geom_type in polygons_gdf.geometry.geom_type.unique():
        if geom_type not in polygon_types:
            non_polygon_geoms.append(geom_type)
    if non_polygon_geoms:
        raise ValueError(
            f"Input contains non-polygon geometries: {non_polygon_geoms}. "
            "Use aggregate_points_to_zones for point data."
        )

    # --- Value columns: accept a single name, require all to exist ---
    if isinstance(value_columns, str):
        value_columns = [value_columns]

    missing_cols = [col for col in value_columns if col not in polygons_gdf.columns]
    if missing_cols:
        raise ValueError(f"Value columns not found in polygons data: {missing_cols}")

    # The zone ID is brought in by the spatial step, so it must not already
    # exist on the polygons side.
    if zone_id_column in polygons_gdf.columns:
        raise ValueError(
            f"Column name conflict: polygons DataFrame contains column '{zone_id_column}' "
            "which conflicts with the zone identifier column. "
            "Please rename this column in the polygons data."
        )

    # --- Bring polygons into the zones' CRS ---
    if polygons_gdf.crs != zones.crs:
        polygons_gdf = polygons_gdf.to_crs(zones.crs)

    # --- Resolve aggregation functions ---
    agg_funcs = _process_aggregation_methods(aggregation, value_columns)

    # Un-suffixed column name -> method; consulted when choosing fill values.
    agg_method_for_col: Dict[str, str]
    if isinstance(aggregation, str):
        agg_method_for_col = dict.fromkeys(value_columns, aggregation)
    else:
        agg_method_for_col = dict(aggregation)

    # --- Spatial aggregation on a slim copy of the zones ---
    minimal_zones = zones[[zone_id_column, "geometry"]].copy()

    if predicate != "fractional":
        aggregated_data = _simple_aggregation(
            polygons_gdf,
            minimal_zones,
            value_columns,
            agg_funcs,
            zone_id_column,
            predicate,
        )
    else:
        aggregated_data = _fractional_aggregation(
            polygons_gdf, minimal_zones, value_columns, agg_funcs, zone_id_column
        )

    # --- Attach aggregated attributes (geometry excluded) to the full zones ---
    attribute_columns = [col for col in aggregated_data.columns if col != "geometry"]
    result = zones.merge(
        aggregated_data[attribute_columns],
        on=zone_id_column,
        how="left",
    )

    # --- Fill gaps before the suffix is applied, while names still match
    # the keys of agg_method_for_col: 0 for count, NaN otherwise ---
    aggregated_cols = [col for col in result.columns if col not in zones.columns]
    for new_col in aggregated_cols:
        if pd.api.types.is_numeric_dtype(result[new_col]):
            is_count = agg_method_for_col.get(new_col, "") == "count"
            result[new_col] = result[new_col].fillna(0 if is_count else np.nan)

    # --- Suffix the new columns ---
    if output_suffix:
        result = result.rename(
            columns={col: f"{col}{output_suffix}" for col in aggregated_cols}
        )

    if drop_geometry:
        result = result.drop(columns=["geometry"])

    return result

annotate_with_admin_regions(gdf, country_code, data_store=None, admin_id_column_suffix='')

Annotate a GeoDataFrame with administrative region information.

Performs a spatial join between the input points and administrative boundaries at levels 1 and 2, resolving conflicts when points intersect multiple regions.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing points to annotate.

required
country_code str

ISO country code for administrative boundaries.

required
data_store Optional[DataStore]

Optional DataStore for loading boundary data.

None
admin_id_column_suffix

Optional suffix for admin ID columns.

''

Returns:

Type Description
GeoDataFrame

GeoDataFrame with added administrative region columns (admin1, admin2, etc.).

Raises:

Type Description
TypeError

If gdf is not a GeoDataFrame.

Source code in gigaspatial/processing/geo.py
def annotate_with_admin_regions(
    gdf: gpd.GeoDataFrame,
    country_code: str,
    data_store: Optional[DataStore] = None,
    admin_id_column_suffix="",
) -> gpd.GeoDataFrame:
    """
    Annotate a GeoDataFrame with administrative region information.

    Performs a spatial join between the input points and administrative boundaries
    at levels 1 and 2. When a point intersects several regions, the region whose
    centroid is closest to the point is kept, so the output has exactly one row
    per input row.

    Args:
        gdf: GeoDataFrame containing points to annotate. It is not modified.
        country_code: ISO country code for administrative boundaries.
        data_store: Optional DataStore for loading boundary data.
        admin_id_column_suffix: Optional suffix for admin ID columns.

    Returns:
        GeoDataFrame in EPSG:4326 with added administrative region columns
        (admin1, admin2 and their ID columns) and a fresh 0..n-1 index.
        An empty input is returned as-is.

    Raises:
        TypeError: If gdf is not a GeoDataFrame.
    """
    from gigaspatial.handlers.boundaries import AdminBoundaries

    if not isinstance(gdf, gpd.GeoDataFrame):
        raise TypeError("gdf must be a GeoDataFrame")

    if gdf.empty:
        LOGGER.warning("Empty GeoDataFrame provided, returning as-is")
        return gdf

    admin1_id_col = f"admin1_id{admin_id_column_suffix}"
    admin2_id_col = f"admin2_id{admin_id_column_suffix}"

    # read country admin data
    admin1_data = AdminBoundaries.create(
        country_code=country_code, admin_level=1, data_store=data_store
    ).to_geodataframe()
    admin1_data = admin1_data.rename(
        columns={"boundary_id": admin1_id_col, "name": "admin1"}
    )[[admin1_id_col, "admin1", "geometry"]]

    admin2_data = AdminBoundaries.create(
        country_code=country_code, admin_level=2, data_store=data_store
    ).to_geodataframe()
    admin2_data = admin2_data.rename(
        columns={
            "boundary_id": admin2_id_col,
            "parent_id": admin1_id_col,
            "name": "admin2",
        }
    )[[admin2_id_col, "admin2", admin1_id_col, "geometry"]]

    # Join level-2 regions to their level-1 parent on the admin1 ID column.
    # The outer join keeps admin1 regions that have no admin2 children.
    admin_data = admin2_data.merge(
        admin1_data,
        left_on=admin1_id_col,
        right_on=admin1_id_col,
        how="outer",
    )

    # Prefer the admin2 geometry; fall back to the admin1 geometry
    admin_data["geometry"] = admin_data.apply(
        lambda x: x.geometry_x if x.geometry_x else x.geometry_y, axis=1
    )

    admin_data = gpd.GeoDataFrame(
        admin_data.drop(columns=["geometry_x", "geometry_y"]),
        geometry="geometry",
        crs=4326,
    )

    admin_data[admin2_id_col] = admin_data[admin2_id_col].replace({np.nan: None})

    if gdf.crs is None:
        LOGGER.warning("Input GeoDataFrame has no CRS, assuming EPSG:4326")
        # Non in-place: do not alter the caller's GeoDataFrame
        gdf = gdf.set_crs(epsg=4326)
    elif gdf.crs != "EPSG:4326":
        LOGGER.info(f"Reprojecting from {gdf.crs} to EPSG:4326")
        gdf = gdf.to_crs(epsg=4326)

    # Tag each input row with its position so that rows duplicated by the
    # join can be traced back reliably, even if the input index is not unique.
    row_pos_col = "__row_position__"
    dist_col = "__centroid_distance__"
    points = gdf.copy()
    points[row_pos_col] = np.arange(len(points))

    # spatial join gdf to admins
    gdf_w_admins = points.sjoin(
        admin_data,
        how="left",
        predicate="intersects",
    )

    # Check for duplicates caused by points intersecting multiple polygons
    if len(gdf_w_admins) != len(gdf):
        LOGGER.warning(
            "Some points intersect multiple administrative boundaries. Resolving conflicts..."
        )

        geom_col = gdf_w_admins.geometry.name

        def _centroid_distance(row):
            # Unmatched points carry a missing index_right; rank them last.
            if pd.isna(row["index_right"]):
                return np.inf
            centroid = admin_data.loc[row["index_right"], "geometry"].centroid
            return row[geom_col].distance(centroid)

        gdf_w_admins[dist_col] = gdf_w_admins.apply(_centroid_distance, axis=1)

        # For points with multiple matches, keep the closest polygon:
        # order by input row, then by distance, and keep the first match of
        # each input row. This also restores the input row order.
        gdf_w_admins = (
            gdf_w_admins.sort_values([row_pos_col, dist_col])
            .drop_duplicates(subset=row_pos_col, keep="first")
            .drop(columns=dist_col)
        )

    # Drop helper columns and reset the index
    gdf_w_admins = gdf_w_admins.drop(columns=["index_right", row_pos_col]).reset_index(
        drop=True
    )

    return gdf_w_admins

buffer_geodataframe(gdf, buffer_distance_meters, cap_style='round', copy=True)

Buffer a GeoDataFrame with a distance in meters.

Automatically handles UTM transformation for accurate buffering.

Parameters:

Name Type Description Default
gdf GeoDataFrame

The GeoDataFrame to be buffered.

required
buffer_distance_meters Union[float, array, Series]

The buffer distance.

required
cap_style Literal['round', 'square', 'flat']

Style of caps ('round', 'square', 'flat').

'round'
copy

Whether to create a copy of the input.

True

Returns:

Type Description
GeoDataFrame

The buffered GeoDataFrame in the original CRS.

Source code in gigaspatial/processing/geo.py
def buffer_geodataframe(
    gdf: gpd.GeoDataFrame,
    buffer_distance_meters: Union[float, np.array, pd.Series],
    cap_style: Literal["round", "square", "flat"] = "round",
    copy=True,
) -> gpd.GeoDataFrame:
    """
    Buffer a GeoDataFrame with a distance in meters.

    The geometries are reprojected to a metric CRS (an estimated UTM zone,
    or the helper's fallback CRS when estimation fails), buffered there, and
    reprojected back to the CRS of the input.

    Args:
        gdf: The GeoDataFrame to be buffered. Must have a CRS defined.
        buffer_distance_meters: The buffer distance, either a single value or
            one value per row.
        cap_style: Style of caps ('round', 'square', 'flat').
        copy: Whether to create a copy of the input before processing. The
            reprojection step already returns a new frame, so the input is
            not modified either way; the flag is kept for backward
            compatibility.

    Returns:
        The buffered GeoDataFrame in the original CRS.

    Raises:
        TypeError: If ``gdf`` is not a GeoDataFrame.
        ValueError: If ``cap_style`` is not supported or ``gdf`` has no CRS.
        RuntimeError: If reprojection or buffering fails. The underlying
            exception is chained as ``__cause__``.
    """

    # Input validation (kept outside the try block so these errors are not wrapped)
    if not isinstance(gdf, gpd.GeoDataFrame):
        raise TypeError("Input must be a GeoDataFrame")

    if cap_style not in ["round", "square", "flat"]:
        raise ValueError("cap_style must be round, flat or square.")

    if gdf.crs is None:
        raise ValueError("Input GeoDataFrame must have a defined CRS")

    # Create a copy if requested
    gdf_work = gdf.copy() if copy else gdf

    # Remember the input CRS so the result can be projected back to it
    input_crs = gdf_work.crs

    try:
        # Use the shared helper so the fallback behaviour matches the rest of the module
        utm_crs = estimate_utm_crs_with_fallback(gdf_work)

        # Transform to the metric CRS, create buffer, and transform back
        gdf_work = gdf_work.to_crs(utm_crs)
        gdf_work["geometry"] = gdf_work["geometry"].buffer(
            distance=buffer_distance_meters, cap_style=cap_style
        )
        return gdf_work.to_crs(input_crs)

    except Exception as e:
        # Chain the cause so the original traceback stays attached
        raise RuntimeError(f"Error during buffering operation: {str(e)}") from e

calculate_distance(lat1, lon1, lat2, lon2, R=6371000.0)

Calculate the Haversine distance between two points.

Parameters:

Name Type Description Default
lat1

Latitude of point 1.

required
lon1

Longitude of point 1.

required
lat2

Latitude of point 2.

required
lon2

Longitude of point 2.

required
R

Earth radius in meters.

6371000.0

Returns:

Type Description

Distance in meters.

Source code in gigaspatial/processing/geo.py
def calculate_distance(lat1, lon1, lat2, lon2, R=6371e3):
    """
    Calculate the Haversine distance between two points.

    Each coordinate may be a scalar or an array-like; scalars and arrays can
    be mixed and are combined using NumPy broadcasting (for example one fixed
    origin against an array of destinations).

    Args:
        lat1: Latitude of point 1, in degrees.
        lon1: Longitude of point 1, in degrees.
        lat2: Latitude of point 2, in degrees.
        lon2: Longitude of point 2, in degrees.
        R: Earth radius in meters.

    Returns:
        Distance in meters (a NumPy scalar for scalar inputs, otherwise an
        array with the broadcast shape of the inputs).
    """
    # Convert every argument on its own: stacking them into a single list
    # breaks as soon as scalars and arrays (or arrays of different shapes)
    # are mixed.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

calculate_pixels_at_location(gdf, resolution, bbox_size=300, crs='EPSG:3857')

Calculate the number of pixels required to cover a bounding box.

Calculates the dimensions in pixels for a given physical size (meters) around a coordinate, accounting for Mercator scale distortion.

Parameters:

Name Type Description Default
gdf

GeoDataFrame with Point geometries (WGS84).

required
resolution

Target resolution in meters per pixel.

required
bbox_size

Bounding box side length in meters.

300
crs

Target projection CRS.

'EPSG:3857'

Returns:

Type Description

Number of pixels per side (width and height).

Source code in gigaspatial/processing/sat_images.py
def calculate_pixels_at_location(gdf, resolution, bbox_size=300, crs="EPSG:3857"):
    """
    Work out how many pixels a square bounding box spans at the data's location.

    The mean longitude/latitude of the point geometries is taken as the
    location. The requested resolution is scaled by cos(latitude) (the
    Mercator scale correction) and the box side length is divided by that
    effective resolution.

    Args:
        gdf: GeoDataFrame with Point geometries (WGS84).
        resolution: Target resolution in meters per pixel.
        bbox_size: Bounding box side length in meters.
        crs: Target projection CRS.

    Returns:
        Number of pixels per side (width and height), rounded to an integer.
    """
    # Representative location: average of all point coordinates
    mean_lon = gdf.geometry.x.mean()
    mean_lat = gdf.geometry.y.mean()

    # Build the WGS84 -> target CRS transformer and project the centre point.
    # NOTE: the projected coordinates are not used by the calculation below;
    # the call is kept so that CRS/transform errors surface exactly as before.
    source_crs = pyproj.CRS("EPSG:4326")
    target_crs = pyproj.CRS(crs)
    to_target = pyproj.Transformer.from_crs(source_crs, target_crs, always_xy=True)
    to_target.transform(mean_lon, mean_lat)

    # Scale correction at the mean latitude, applied to the resolution
    effective_resolution = resolution * np.cos(np.radians(mean_lat))

    return int(round(bbox_size / effective_resolution))

convert_to_geodataframe(data, lat_col=None, lon_col=None, crs='EPSG:4326')

Convert a pandas DataFrame to a GeoDataFrame.

Supports conversion from latitude/longitude columns or WKT/geometry columns.

Parameters:

Name Type Description Default
data DataFrame

Input DataFrame.

required
lat_col str

Name of the latitude column.

None
lon_col str

Name of the longitude column.

None
crs

Coordinate Reference System. Defaults to 'EPSG:4326'.

'EPSG:4326'

Returns:

Type Description
GeoDataFrame

A GeoDataFrame containing the input data with a geometry column.

Raises:

Type Description
TypeError

If input is not a pandas DataFrame.

ValueError

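If required columns are missing or invalid. Note that this error is raised inside the function's outer `try` block, so callers receive it re-raised as a `RuntimeError`.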

Source code in gigaspatial/processing/geo.py
def convert_to_geodataframe(
    data: pd.DataFrame, lat_col: str = None, lon_col: str = None, crs="EPSG:4326"
) -> gpd.GeoDataFrame:
    """
    Convert a pandas DataFrame to a GeoDataFrame.

    Supports conversion from latitude/longitude columns or from a
    ``geometry`` column holding either geometry objects or WKT strings.
    When a ``geometry`` column is present it takes precedence and
    ``lat_col``/``lon_col`` are ignored.

    Args:
        data: Input DataFrame. It is copied, never modified.
        lat_col: Name of the latitude column. Auto-detected when omitted.
        lon_col: Name of the longitude column. Auto-detected when omitted.
        crs: Coordinate Reference System. Defaults to 'EPSG:4326'.

    Returns:
        A GeoDataFrame containing the input data with a geometry column.

    Raises:
        TypeError: If input is not a pandas DataFrame.
        RuntimeError: If the conversion fails for any other reason, e.g.
            coordinate columns are missing, contain missing values, cannot be
            detected, or the geometry column has mixed/unsupported types.
            Those conditions are raised internally as ``ValueError`` and are
            available as ``__cause__`` of the ``RuntimeError``.
    """

    # Input validation
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input 'data' must be a pandas DataFrame")

    # Create a copy to avoid modifying the input
    df = data.copy()

    try:
        if "geometry" not in df.columns:
            # If column names not provided, try to detect them
            if lat_col is None or lon_col is None:
                try:
                    detected_lat, detected_lon = detect_coordinate_columns(df)
                except ValueError as e:
                    raise ValueError(
                        f"Could not automatically detect coordinate columns and no "
                        f"'geometry' column found. Error: {str(e)}"
                    ) from e
                lat_col = lat_col or detected_lat
                lon_col = lon_col or detected_lon

            # Validate latitude/longitude columns exist
            if lat_col not in df.columns or lon_col not in df.columns:
                raise ValueError(
                    f"Could not find columns: {lat_col} and/or {lon_col} in the DataFrame"
                )

            # Check for missing values
            if df[lat_col].isna().any() or df[lon_col].isna().any():
                raise ValueError(
                    f"Missing values found in {lat_col} and/or {lon_col} columns"
                )

            # Create geometry from lat/lon
            geometry = gpd.points_from_xy(x=df[lon_col], y=df[lat_col])

        else:
            geometry_values = df["geometry"]
            if geometry_values.apply(
                lambda x: isinstance(x, base.BaseGeometry)
            ).all():
                # Column already holds geometry objects: use it as-is
                geometry = geometry_values
            elif geometry_values.apply(lambda x: isinstance(x, str)).all():
                # Convert WKT strings to geometry objects and drop the raw
                # text column so it is replaced by the parsed geometries
                geometry = geometry_values.apply(wkt.loads)
                df = df.drop(columns=["geometry"])
            else:
                raise ValueError(
                    "Invalid geometry format: contains mixed or unsupported types"
                )

        return gpd.GeoDataFrame(df, geometry=geometry, crs=crs)

    except Exception as e:
        # Exception type kept for backward compatibility; the cause is chained
        # so the original error and traceback remain inspectable.
        raise RuntimeError(f"Error converting to GeoDataFrame: {str(e)}") from e

detect_coordinate_columns(data, lat_keywords=None, lon_keywords=None, case_sensitive=False)

Detect latitude and longitude columns using keyword matching.

Parameters:

Name Type Description Default
data

DataFrame to search for coordinate columns.

required
lat_keywords

Keywords for identifying latitude columns.

None
lon_keywords

Keywords for identifying longitude columns.

None
case_sensitive

Whether to perform case-sensitive matching.

False

Returns:

Type Description
Tuple[str, str]

Tuple of (latitude_column_name, longitude_column_name).

Raises:

Type Description
ValueError

If no unique pair of latitude/longitude columns can be found.

TypeError

If input data is not a pandas DataFrame.

Source code in gigaspatial/processing/geo.py
def detect_coordinate_columns(
    data, lat_keywords=None, lon_keywords=None, case_sensitive=False
) -> Tuple[str, str]:
    """
    Detect latitude and longitude columns using keyword matching.

    A column is a candidate when any keyword occurs in its name delimited by
    regex word boundaries. Columns matching both the latitude and the
    longitude keywords are treated as ambiguous and excluded from both
    candidate lists. Column labels that are not strings are ignored.

    Args:
        data: DataFrame to search for coordinate columns.
        lat_keywords: Keywords for identifying latitude columns. Falls back to
            a built-in list when omitted or empty.
        lon_keywords: Keywords for identifying longitude columns. Falls back
            to a built-in list when omitted or empty.
        case_sensitive: Whether to perform case-sensitive matching.

    Returns:
        Tuple of (latitude_column_name, longitude_column_name).

    Raises:
        ValueError: If no unique pair of latitude/longitude columns can be
            found, or the DataFrame has duplicate column names.
        TypeError: If input data is not a pandas DataFrame.
        RuntimeError: If matching fails unexpectedly (e.g. a keyword is not a
            string); the original exception is chained as ``__cause__``.
    """

    # Default keywords if none provided
    default_lat = [
        "latitude",
        "lat",
        "y",
        "lat_",
        "lat(s)",
        "_lat",
        "ylat",
        "latitude_y",
    ]
    default_lon = [
        "longitude",
        "lon",
        "long",
        "x",
        "lon_",
        "lon(e)",
        "long(e)",
        "_lon",
        "xlon",
        "longitude_x",
    ]

    lat_keywords = lat_keywords or default_lat
    lon_keywords = lon_keywords or default_lon

    # Input validation
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    if not data.columns.is_unique:
        raise ValueError("DataFrame contains duplicate column names")

    flags = 0 if case_sensitive else re.IGNORECASE

    def compile_pattern(keywords):
        """Compile one alternation regex matching any keyword as a whole word."""
        alternation = "|".join(rf"\b{re.escape(keyword)}\b" for keyword in keywords)
        return re.compile(alternation, flags)

    def find_matching_columns(columns, regex):
        """Return the string column labels that match the compiled regex."""
        return [col for col in columns if isinstance(col, str) and regex.search(col)]

    try:
        lat_matches = find_matching_columns(data.columns, compile_pattern(lat_keywords))
        lon_matches = find_matching_columns(data.columns, compile_pattern(lon_keywords))

        # Columns matching both patterns are ambiguous: remove them from BOTH
        # lists. The overlap is computed up front so the filtering is symmetric.
        ambiguous = set(lat_matches) & set(lon_matches)
        lat_cols = [col for col in lat_matches if col not in ambiguous]
        lon_cols = [col for col in lon_matches if col not in ambiguous]

        # Detailed error messages based on what was found
        if not lat_cols and not lon_cols:
            columns_list = "\n".join(f"- {col}" for col in data.columns)
            raise ValueError(
                f"No latitude or longitude columns found. Available columns are:\n{columns_list}\n"
                f"Consider adding more keywords or checking column names."
            )

        if not lat_cols:
            found_lons = ", ".join(lon_cols)
            raise ValueError(
                f"Found longitude columns ({found_lons}) but no latitude columns. "
                "Check latitude keywords or column names."
            )

        if not lon_cols:
            found_lats = ", ".join(lat_cols)
            raise ValueError(
                f"Found latitude columns ({found_lats}) but no longitude columns. "
                "Check longitude keywords or column names."
            )

        if len(lat_cols) > 1 or len(lon_cols) > 1:
            raise ValueError(
                f"Multiple possible coordinate columns found:\n"
                f"Latitude candidates: {lat_cols}\n"
                f"Longitude candidates: {lon_cols}\n"
                "Please specify more precise keywords."
            )

        return lat_cols[0], lon_cols[0]

    except ValueError:
        # Detection failures are part of the documented contract: pass through
        raise
    except Exception as e:
        raise RuntimeError(f"Error detecting coordinate columns: {str(e)}") from e

estimate_utm_crs_with_fallback(gdf, logger=LOGGER, fallback_crs='EPSG:3857')

Robustly estimate an appropriate UTM CRS for a GeoDataFrame.

This helper wraps GeoDataFrame.estimate_utm_crs and falls back to a configurable CRS (default: Web Mercator) when estimation fails or returns None.

Parameters:

Name Type Description Default
gdf GeoDataFrame

Input GeoDataFrame used to estimate a suitable UTM CRS.

required
logger

Optional logger used to emit warnings when falling back.

LOGGER
fallback_crs str

CRS to use when UTM estimation fails. Defaults to "EPSG:3857".

'EPSG:3857'

Returns:

Type Description

A CRS object or string suitable for GeoDataFrame.to_crs.

Source code in gigaspatial/processing/geo.py
def estimate_utm_crs_with_fallback(
    gdf: gpd.GeoDataFrame,
    logger=LOGGER,
    fallback_crs: str = "EPSG:3857",
):
    """
    Robustly estimate an appropriate UTM CRS for a GeoDataFrame.

    This helper wraps ``GeoDataFrame.estimate_utm_crs`` and falls back to a
    configurable CRS (default: Web Mercator) when the input is ``None`` or
    empty, when estimation raises, or when it returns a falsy value. Exactly
    one warning is emitted per fallback, describing the actual reason.

    Args:
        gdf: Input GeoDataFrame used to estimate a suitable UTM CRS.
        logger: Optional logger used to emit warnings when falling back.
            Pass ``None`` to silence the warnings.
        fallback_crs: CRS to use when UTM estimation fails. Defaults to "EPSG:3857".

    Returns:
        A CRS object or string suitable for ``GeoDataFrame.to_crs``.
    """
    if gdf is None or gdf.empty:
        if logger is not None:
            logger.warning(
                "UTM CRS estimation requested for an empty GeoDataFrame; "
                f"falling back to {fallback_crs}."
            )
        return fallback_crs

    try:
        utm_crs = gdf.estimate_utm_crs()
    except Exception as e:
        # Deliberately broad: any estimation failure is recoverable here.
        # Return immediately so the "returned None" warning below is not
        # logged on top of this one.
        if logger is not None:
            logger.warning(
                f"UTM CRS estimation failed, using fallback CRS {fallback_crs}. "
                f"Error: {e}"
            )
        return fallback_crs

    if not utm_crs:
        if logger is not None:
            logger.warning(
                f"UTM CRS estimation returned None, using fallback CRS {fallback_crs}."
            )
        return fallback_crs

    return utm_crs

get_centroids(gdf)

Calculate the centroids of a (Multi)Polygon GeoDataFrame.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing (Multi)Polygon geometries.

required

Returns:

Type Description
GeoDataFrame

A new GeoDataFrame with Point geometries representing the centroids.

Raises:

Type Description
ValueError

If the input GeoDataFrame contains non-polygon geometries.

Source code in gigaspatial/processing/geo.py
def get_centroids(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Replace each (Multi)Polygon geometry with its centroid.

    Args:
        gdf: GeoDataFrame whose geometries are all Polygon or MultiPolygon.

    Returns:
        A copy of the input in which the ``geometry`` column holds the
        centroid of every original geometry; the input is left untouched.

    Raises:
        ValueError: If any geometry is not a Polygon or MultiPolygon.
    """
    # Reject anything that is not polygonal before doing any work
    polygonal_types = ["Polygon", "MultiPolygon"]
    is_polygonal = gdf.geometry.geom_type.isin(polygonal_types)
    if not is_polygonal.all():
        raise ValueError(
            "Input GeoDataFrame must contain only Polygon or MultiPolygon geometries."
        )

    # Work on a copy so the caller's frame keeps its polygons
    result = gdf.copy()
    result["geometry"] = result.geometry.centroid
    return result

map_points_within_polygons(base_points_gdf, polygon_gdf)

Map whether each point is within any polygon.

Parameters:

Name Type Description Default
base_points_gdf

GeoDataFrame containing points.

required
polygon_gdf

GeoDataFrame containing polygons.

required

Returns:

Type Description

The base_points_gdf with an additional is_within boolean column.

Raises:

Type Description
ValueError

If geometries are invalid or match is impossible (CRS mismatch).

Source code in gigaspatial/processing/geo.py
def map_points_within_polygons(base_points_gdf, polygon_gdf):
    """
    Flag every point according to whether it lies within at least one polygon.

    Args:
        base_points_gdf: GeoDataFrame containing only Point geometries.
        polygon_gdf: GeoDataFrame containing only Polygon/MultiPolygon
            geometries, in the same CRS as the points.

    Returns:
        The same `base_points_gdf` object, with an `is_within` boolean column
        added to it in place.

    Raises:
        ValueError: If either frame contains unexpected geometry types or the
            two frames do not share the same CRS.
    """
    # Geometry-type checks, points first, then polygons
    point_types = base_points_gdf.geometry.geom_type
    if not (point_types == "Point").all():
        raise ValueError("`base_points_gdf` must contain only Point geometries.")

    polygon_types = polygon_gdf.geometry.geom_type
    if not polygon_types.isin(["Polygon", "MultiPolygon"]).all():
        raise ValueError(
            "`polygon_gdf` must contain only Polygon or MultiPolygon geometries."
        )

    # A spatial join across different CRSs would be meaningless
    same_crs = base_points_gdf.crs == polygon_gdf.crs
    if not same_crs:
        raise ValueError("CRS of `base_points_gdf` and `polygon_gdf` must match.")

    # Left join keeps every point; unmatched points get a missing `index_right`
    polygons_only = polygon_gdf[["geometry"]]
    matches = gpd.sjoin(
        base_points_gdf, polygons_only, how="left", predicate="within"
    )

    # Index labels of the points that matched at least one polygon
    has_match = matches.index_right.notna()
    matched_labels = set(matches.index[has_match])

    base_points_gdf["is_within"] = base_points_gdf.index.isin(matched_labels)
    return base_points_gdf

simplify_geometries(gdf, tolerance=0.01, preserve_topology=True, geometry_column='geometry')

Simplify geometries to reduce file size and improve performance.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing geometries to simplify.

required
tolerance float

Tolerance for simplification.

0.01
preserve_topology bool

Whether to preserve topology.

True
geometry_column str

Name of the geometry column.

'geometry'

Returns:

Type Description
GeoDataFrame

A new GeoDataFrame with simplified geometries.

Raises:

Type Description
ValueError

If the geometry column is missing. (Invalid geometries in the column raise a `TypeError` instead.)

Source code in gigaspatial/processing/geo.py
def simplify_geometries(
    gdf: gpd.GeoDataFrame,
    tolerance: float = 0.01,
    preserve_topology: bool = True,
    geometry_column: str = "geometry",
) -> gpd.GeoDataFrame:
    """
    Return a copy of the GeoDataFrame with simplified geometries.

    Simplification reduces file size and improves performance.

    Args:
        gdf: GeoDataFrame containing geometries to simplify.
        tolerance: Tolerance passed to the simplification.
        preserve_topology: Whether to preserve topology while simplifying.
        geometry_column: Name of the column holding the geometries.

    Returns:
        A new GeoDataFrame with simplified geometries; the input is not modified.

    Raises:
        ValueError: If ``geometry_column`` is not a column of ``gdf``.
        TypeError: If the column contains invalid geometries.
    """
    # Guard: the requested column has to exist
    if geometry_column not in gdf.columns:
        raise ValueError(
            f"Geometry column '{geometry_column}' not found in the GeoDataFrame."
        )

    # Guard: every geometry in the column has to be valid
    all_valid = gpd.GeoSeries(gdf[geometry_column]).is_valid.all()
    if not all_valid:
        raise TypeError(
            f"Geometry column '{geometry_column}' contains invalid geometries."
        )

    # Simplify on a copy so the caller's data stays as it was
    result = gdf.copy()
    simplified = result[geometry_column].simplify(
        tolerance=tolerance, preserve_topology=preserve_topology
    )
    result[geometry_column] = simplified
    return result

algorithms

Spatial algorithms and graph-based matching. Provides optimized tools for distance-based graph construction using KD-Trees, useful for entity resolution and network analysis.

build_distance_graph(left_df, right_df, distance_threshold, max_k=100, return_dataframe=False, verbose=True, exclude_same_index=None)

Build a graph of spatial matches between two dataframes using KD-tree.

Parameters:

Name Type Description Default
left_df Union[DataFrame, GeoDataFrame]

Left dataframe to match from

required
right_df Union[DataFrame, GeoDataFrame]

Right dataframe to match to

required
distance_threshold float

Maximum distance for matching (in meters)

required
max_k int

Maximum number of neighbors to consider per point (default: 100)

100
return_dataframe bool

If True, also return the matches DataFrame

False
verbose bool

If True, print statistics about the graph

True
exclude_same_index Optional[bool]

If True, exclude self-matches. If None, auto-detect based on df equality

None

Returns:

Type Description
Union[Graph, Tuple[Graph, DataFrame]]

NetworkX Graph, or tuple of (Graph, DataFrame) if return_dataframe=True

Raises:

Type Description
ValueError

If distance_threshold is negative or max_k is not positive

Source code in gigaspatial/processing/algorithms.py
def build_distance_graph(
    left_df: Union[pd.DataFrame, gpd.GeoDataFrame],
    right_df: Union[pd.DataFrame, gpd.GeoDataFrame],
    distance_threshold: float,
    max_k: int = 100,
    return_dataframe: bool = False,
    verbose: bool = True,
    exclude_same_index: Optional[bool] = None,
) -> Union[nx.Graph, Tuple[nx.Graph, pd.DataFrame]]:
    """
    Build a graph of spatial matches between two dataframes using KD-tree.

    Both inputs are projected into one shared metric CRS (a UTM zone
    estimated from ``left_df``, with the helper's fallback when estimation
    fails) so that the Euclidean distances computed by the KD-tree are
    comparable. Every index label of the inputs becomes a node; an edge with
    a ``distance`` attribute connects each pair found within the threshold.

    Args:
        left_df: Left dataframe to match from
        right_df: Right dataframe to match to
        distance_threshold: Maximum distance for matching (in meters)
        max_k: Maximum number of neighbors to consider per point (default: 100)
        return_dataframe: If True, also return the matches DataFrame
        verbose: If True, print statistics about the graph
        exclude_same_index: If True, exclude self-matches (the coordinates of
            ``left_df`` are then used for both sides). If None, auto-detect
            based on df equality

    Returns:
        NetworkX Graph, or tuple of (Graph, DataFrame) if return_dataframe=True

    Raises:
        ValueError: If distance_threshold is negative or max_k is not positive
    """

    # Input validation
    if distance_threshold < 0:
        raise ValueError("distance_threshold must be non-negative")

    if max_k <= 0:
        raise ValueError("max_k must be positive")

    if left_df.empty or right_df.empty:
        if verbose:
            LOGGER.warning("Warning: One or both dataframes are empty")
        G = nx.Graph()
        return (G, pd.DataFrame()) if return_dataframe else G

    def as_geodataframe(df: Union[pd.DataFrame, gpd.GeoDataFrame]):
        """Return `df` as a GeoDataFrame, converting plain DataFrames."""
        if isinstance(df, pd.DataFrame) and not isinstance(df, gpd.GeoDataFrame):
            return convert_to_geodataframe(df)
        # Already spatial; the reprojection below returns a new object, so
        # no defensive copy is needed here.
        return df

    # Auto-detect same dataframe case
    if exclude_same_index is None:
        exclude_same_index = left_df.equals(right_df)
        if verbose and exclude_same_index:
            LOGGER.info("Auto-detected same dataframe - excluding self-matches")

    # Estimate ONE projected CRS and use it for both sides. Estimating a CRS
    # per dataframe can pick different UTM zones when the extents differ,
    # which would make the two coordinate sets incomparable.
    left_gdf = as_geodataframe(left_df)
    utm_crs = estimate_utm_crs_with_fallback(
        left_gdf, logger=LOGGER if verbose else None
    )

    def get_utm_coordinates(gdf) -> np.ndarray:
        """Project `gdf` into the shared CRS and return its (x, y) coordinates."""
        return gdf.to_crs(utm_crs).get_coordinates().to_numpy()

    # Get coordinates
    left_coords = get_utm_coordinates(left_gdf)
    right_coords = (
        left_coords
        if exclude_same_index
        else get_utm_coordinates(as_geodataframe(right_df))
    )

    # Build KD-tree and query
    kdtree = cKDTree(right_coords)

    # Use the provided max_k parameter, but don't exceed available points
    k_to_use = min(max_k, len(right_coords))

    if exclude_same_index:
        # Request one extra neighbor to account for self-match removal
        k_to_query = min(k_to_use + 1, len(right_coords))
    else:
        k_to_query = k_to_use

    if verbose and k_to_use < max_k:
        LOGGER.info(
            f"Note: max_k ({max_k}) reduced to {k_to_use} (number of available points)"
        )

    # Note: Distance calculations here are based on Euclidean distance in UTM projection.
    # This can introduce errors up to ~50 cm for a 50 meter threshold, especially near the poles where distortion increases.
    distances, indices = kdtree.query(
        left_coords, k=k_to_query, distance_upper_bound=distance_threshold
    )

    # Handle single k case (when k_to_use = 1, results are 1D)
    if distances.ndim == 1:
        distances = distances.reshape(-1, 1)
        indices = indices.reshape(-1, 1)

    # Extract valid pairs using vectorized operations.
    # Neighbors beyond the threshold are reported with an infinite distance,
    # so the finite-distance mask keeps only real matches.
    left_indices = np.arange(len(distances))[:, np.newaxis]
    left_indices = np.broadcast_to(left_indices, distances.shape)
    valid_mask = np.isfinite(distances)

    if exclude_same_index:
        same_index_mask = left_indices == indices
        valid_mask = valid_mask & ~same_index_mask

    valid_left = left_indices[valid_mask]
    valid_right = indices[valid_mask]
    valid_distances = distances[valid_mask]

    # Map positional indices back to the original index labels
    valid_left_indices = left_df.index.values[valid_left]
    valid_right_indices = right_df.index.values[valid_right]

    # Create matches DataFrame
    matches_df = pd.DataFrame(
        {
            "left_idx": valid_left_indices,
            "right_idx": valid_right_indices,
            "distance": valid_distances,
        }
    )

    # Build graph from the edge list in one call
    G = nx.from_pandas_edgelist(
        matches_df,
        source="left_idx",
        target="right_idx",
        edge_attr="distance",
        create_using=nx.Graph(),
    )

    # Add isolated nodes (nodes without any matches within threshold)
    # This ensures all original indices are represented in the graph
    all_left_nodes = set(left_df.index.values)
    all_right_nodes = set(right_df.index.values)

    if not exclude_same_index:
        all_nodes = all_left_nodes | all_right_nodes
    else:
        all_nodes = all_left_nodes  # Same dataframe, so same node set

    # Add nodes that don't have edges
    existing_nodes = set(G.nodes())
    isolated_nodes = all_nodes - existing_nodes
    G.add_nodes_from(isolated_nodes)

    # Print statistics
    if verbose:
        print(
            f"Total potential matches: {len(left_df)} × {len(right_df)} = {len(left_df) * len(right_df):,}"
        )
        print(f"Matches found within {distance_threshold}m: {len(matches_df):,}")
        print(f"Graph nodes: {G.number_of_nodes():,}")
        print(f"Graph edges: {G.number_of_edges():,}")

        components = list(nx.connected_components(G))
        print(f"Connected components: {len(components):,}")

        if len(components) > 1:
            component_sizes = [len(c) for c in components]
            print(f"Largest component size: {max(component_sizes):,}")
            print(
                f"Isolated nodes: {sum(1 for size in component_sizes if size == 1):,}"
            )

        if len(matches_df) > 0:
            print(
                f"Distance stats - min: {matches_df['distance'].min():.1f}m, "
                f"max: {matches_df['distance'].max():.1f}m, "
                f"mean: {matches_df['distance'].mean():.1f}m"
            )

    return (G, matches_df) if return_dataframe else G

buildings_engine

Engine for large-scale building data processing. Provides partitioned processing for Google and Microsoft building datasets, supporting zonal counts and nearest-building searches for POIs.

BuildingCountsResult dataclass

Result of counting buildings per zone.

Source code in gigaspatial/processing/buildings_engine.py
@dataclass(frozen=True)
class BuildingCountsResult:
    """
    Result of counting buildings per zone.

    Declared as a frozen dataclass, so the ``counts`` field cannot be
    reassigned after the instance has been created.
    """

    # Building counts as a pandas Series; presumably indexed by zone
    # identifier with one entry per zone (TODO confirm against the code
    # that produces this result).
    counts: pd.Series

GoogleMSBuildingsEngine

Shared building-processing engine used by multiple view generators.

This module intentionally contains the "heavy" logic (partitioning, job creation, per-tile scans, and accumulation) so generator classes can remain focused on view orchestration and UX.

Source code in gigaspatial/processing/buildings_engine.py
class GoogleMSBuildingsEngine:
    """
    Shared building-processing engine used by multiple view generators.

    This module intentionally contains the "heavy" logic (partitioning, job creation,
    per-tile scans, and accumulation) so generator classes can remain focused on
    view orchestration and UX.

    Every method is a static or class method; the engine holds no instance state.
    """

    # -----------------------------
    # Shared S2 job creation
    # -----------------------------
    @staticmethod
    def _s2_grid_gdf_from_building_files(
        building_files: Sequence[Path],
    ) -> gpd.GeoDataFrame:
        """
        Build an S2 grid GeoDataFrame with a `filepath` column from S2-tile filenames.

        The stem of every file name is parsed with ``int()`` and used as the
        S2 cell id of that tile.

        Args:
            building_files: Sequence of paths to S2-tiled building files.

        Returns:
            GeoDataFrame containing S2 cell geometries and corresponding filepaths.

        Raises:
            ValueError: If a file stem cannot be parsed as an integer.
        """
        from gigaspatial.grid.s2 import S2Cells

        # cell id -> file path; the id is encoded in the file stem.
        cells = {int(p.stem): p for p in building_files}
        s2_grid = S2Cells.from_cells(cells.keys())
        grid_gdf = s2_grid.to_geodataframe()
        grid_gdf["filepath"] = grid_gdf.cell_id.map(cells)
        return grid_gdf

    @classmethod
    def create_partitioned_jobs_for_zones(
        cls,
        zones_gdf: gpd.GeoDataFrame,
        building_files: Sequence[Path],
        *,
        predicate: Literal["intersects"] = "intersects",
    ) -> List[Tuple[Path, np.ndarray]]:
        """
        Create job list for partitioned building data by finding intersecting S2 cells.

        Zones are spatially joined against the S2 tile grid derived from the
        building file names; each tile that matches at least one zone yields
        one job.

        Args:
            zones_gdf: GeoDataFrame of administrative or grid zones. Must carry
                a ``zone_id`` column.
            building_files: Sequence of building file paths.
            predicate: Spatial predicate for matching.

        Returns:
            List of (filepath, zone_id_array) tuples. Empty when no zone
            matches any tile.
        """
        grid_gdf = cls._s2_grid_gdf_from_building_files(building_files)

        zone_to_cell_map = gpd.sjoin(
            zones_gdf,
            grid_gdf,
            how="inner",
            predicate=predicate,
        )

        if len(zone_to_cell_map) == 0:
            return []

        # One job per tile file, carrying the ids of the zones matched to it.
        return [
            (filepath, group.zone_id.values)
            for filepath, group in zone_to_cell_map.groupby("filepath")
        ]

    @classmethod
    def create_partitioned_jobs_for_pois(
        cls,
        pois_gdf: gpd.GeoDataFrame,
        building_files: Sequence[Path],
        *,
        search_radius_m: float,
        predicate: Literal["intersects"] = "intersects",
    ) -> List[Tuple[Path, np.ndarray]]:
        """
        Create job list for partitioned building data by buffering POIs.

        POIs are buffered by ``search_radius_m`` (via ``buffer_geodataframe``)
        and the buffers are spatially joined against the S2 tile grid, so a POI
        near a tile border is assigned to every tile its buffer matches.

        Args:
            pois_gdf: GeoDataFrame of Points of Interest. Must carry a
                ``poi_id`` column.
            building_files: Sequence of building file paths.
            search_radius_m: Buffer radius in meters.
            predicate: Spatial predicate for matching.

        Returns:
            List of (filepath, poi_id_array) tuples. Empty when no buffered
            POI matches any tile.
        """
        grid_gdf = cls._s2_grid_gdf_from_building_files(building_files)

        buffered_pois = buffer_geodataframe(
            pois_gdf,
            buffer_distance_meters=search_radius_m,
        )

        poi_to_cell_map = gpd.sjoin(
            buffered_pois,
            grid_gdf,
            how="inner",
            predicate=predicate,
        )

        if len(poi_to_cell_map) == 0:
            return []

        # One job per tile file, carrying the ids of the POIs matched to it.
        return [
            (filepath, group.poi_id.values)
            for filepath, group in poi_to_cell_map.groupby("filepath")
        ]

    # -----------------------------
    # Shared building loading
    # -----------------------------
    @staticmethod
    def _load_filtered_buildings(handler, filepath: Path, source_filter: SourceFilter):
        """
        Load one building file and keep only rows matching ``source_filter``.

        Args:
            handler: The building data handler; ``handler.reader.load`` is used
                to read the file.
            filepath: Path of the building file to read.
            source_filter: Value the ``bf_source`` column must equal, or
                ``None`` to keep every building.

        Returns:
            The object returned by ``handler.reader.load``, restricted to the
            matching rows when a filter is given. May be empty.
        """
        # The source column is only read when it is needed for filtering.
        columns_to_read = (
            ["geometry"] if source_filter is None else ["bf_source", "geometry"]
        )
        buildings = handler.reader.load(filepath, columns=columns_to_read)

        if source_filter is not None and len(buildings) > 0:
            buildings = buildings.loc[buildings["bf_source"] == source_filter]
        return buildings

    # -----------------------------
    # Zonal: count buildings
    # -----------------------------
    @classmethod
    def count_buildings_in_zones(
        cls,
        *,
        handler,
        building_files: Sequence[Path],
        zones_gdf: gpd.GeoDataFrame,
        source_filter: SourceFilter,
        logger=None,
    ) -> BuildingCountsResult:
        """
        Count buildings intersecting each zone.

        With exactly one building file, every zone is checked against that
        file. Otherwise the files are treated as S2 tiles and only the tiles
        matched to at least one zone are read; per-tile counts are summed.
        A file that fails to load or process is logged and skipped, so its
        buildings are missing from the result.

        Args:
            handler: The building data handler (e.g. MSBuildingsHandler).
            building_files: Sequence of building file paths.
            zones_gdf: GeoDataFrame with 'zone_id' and 'geometry'.
            source_filter: Filter for specific building sources ('google' or 'microsoft').
                ``None`` disables filtering.
            logger: Optional logger. Falls back to the module-level LOGGER.

        Returns:
            BuildingCountsResult containing the per-zone counts, indexed by
            ``zone_id`` and 0 for zones with no buildings.
        """
        from shapely.strtree import STRtree

        logger = logger or LOGGER

        global_counts = pd.Series(0, index=zones_gdf.zone_id, dtype=int)

        is_single_file = len(building_files) == 1
        if is_single_file:
            jobs_list = [(building_files[0], zones_gdf.zone_id.values)]
        else:
            jobs_list = cls.create_partitioned_jobs_for_zones(zones_gdf, building_files)

        if len(building_files) > 1 and len(jobs_list) == 0:
            # Partitioned case but no intersecting tiles
            return BuildingCountsResult(counts=global_counts)

        logger.info(f"Processing {len(jobs_list)} building file(s)...")

        for filepath, zone_ids in jobs_list:
            try:
                buildings = cls._load_filtered_buildings(
                    handler, filepath, source_filter
                )

                if len(buildings) == 0:
                    logger.debug(f"No buildings in {filepath.name} after filtering")
                    continue

                # Positional index is reset so that the query's zone indices
                # line up with bincount slots and with zone_id_array below.
                subset_zones = zones_gdf.loc[zones_gdf.zone_id.isin(zone_ids)].copy()
                subset_zones = subset_zones.reset_index(drop=True)

                tree = STRtree(buildings.geometry)
                # First row of the result holds the positional index of the
                # zone for every (zone, building) hit.
                zone_idxs, _ = tree.query(subset_zones.geometry, predicate="intersects")
                building_counts = np.bincount(zone_idxs, minlength=len(subset_zones))

                zone_id_array = subset_zones.zone_id.values
                if is_single_file:
                    global_counts.loc[zone_id_array] = building_counts
                else:
                    # A zone can span several tiles, so counts accumulate.
                    global_counts.loc[zone_id_array] += building_counts

                updated_zones = int((building_counts > 0).sum())
                logger.info(
                    f"Processed {filepath.name} - {updated_zones}/{len(subset_zones)} zones have buildings"
                )
            except Exception as e:
                # Best effort: one bad tile must not abort the whole run.
                logger.error(f"Failed to process {filepath.name}: {str(e)}")

        return BuildingCountsResult(counts=global_counts)

    # -----------------------------
    # POI: nearest building distance
    # -----------------------------
    @classmethod
    def nearest_buildings_to_pois(
        cls,
        *,
        handler,
        building_files: Sequence[Path],
        pois_gdf: gpd.GeoDataFrame,
        source_filter: SourceFilter,
        search_radius_m: float,
        logger=None,
    ) -> NearestBuildingsResult:
        """
        Find the nearest building distance (meters) per POI.

        For every processed file the nearest building centroid to each POI is
        picked with a KD-tree on the raw x/y coordinates, and the distance to
        it is then computed with ``calculate_distance``. Across files the
        smallest distance per POI is kept. A file that fails to load or
        process is logged and skipped.

        Args:
            handler: The building data handler.
            building_files: Sequence of building file paths.
            pois_gdf: GeoDataFrame with 'poi_id' and 'geometry'. Geometries
                are read through ``.x`` / ``.y``, so they must be points.
            source_filter: Filter for building sources. ``None`` disables
                filtering.
            search_radius_m: Search distance for partitioned optimization. It
                only selects which tiles are read for a POI; returned
                distances are not capped by it.
            logger: Optional logger. Falls back to the module-level LOGGER.

        Returns:
            NearestBuildingsResult containing distances in meters, indexed by
            ``poi_id``. POIs for which no building was found keep ``np.inf``.
        """
        from scipy.spatial import cKDTree

        logger = logger or LOGGER

        global_min_dists = pd.Series(np.inf, index=pois_gdf.poi_id, dtype=float)

        is_single_file = len(building_files) == 1
        if is_single_file:
            jobs = [(building_files[0], pois_gdf.poi_id.values)]
        else:
            jobs = cls.create_partitioned_jobs_for_pois(
                pois_gdf,
                building_files,
                search_radius_m=search_radius_m,
            )

        if len(building_files) > 1 and len(jobs) == 0:
            return NearestBuildingsResult(distances_m=global_min_dists)

        logger.info(f"Processing {len(jobs)} building file(s)...")

        for filepath, poi_ids in jobs:
            try:
                buildings = cls._load_filtered_buildings(
                    handler, filepath, source_filter
                )

                if len(buildings) == 0:
                    logger.debug(f"No buildings in {filepath.name} after filtering")
                    continue

                subset_pois = pois_gdf.loc[pois_gdf.poi_id.isin(poi_ids)]
                if len(subset_pois) == 0:
                    continue

                # Distances below come out in subset_pois row order, so they
                # are written back with ids taken from those same rows. The
                # job's poi_ids array comes from a spatial join and is not
                # guaranteed to share that order.
                subset_poi_ids = subset_pois.poi_id.values

                poi_coords = np.vstack(
                    (subset_pois.geometry.x, subset_pois.geometry.y)
                ).T

                b_centroids = buildings.geometry.centroid
                b_coords = np.vstack((b_centroids.x, b_centroids.y)).T
                if len(b_coords) == 0:
                    continue

                # Nearest neighbour is chosen in raw coordinate space; the
                # metric distance is computed afterwards for that pair only.
                tree = cKDTree(b_coords)
                _, building_idxs = tree.query(poi_coords, k=1)

                nearest_b_coords = b_coords[building_idxs]
                # x is passed as longitude and y as latitude.
                distances_meters = calculate_distance(
                    lat1=poi_coords[:, 1],
                    lon1=poi_coords[:, 0],
                    lat2=nearest_b_coords[:, 1],
                    lon2=nearest_b_coords[:, 0],
                )

                if is_single_file:
                    global_min_dists.loc[subset_poi_ids] = distances_meters
                else:
                    # A POI can appear in several tiles; keep the minimum.
                    current_bests = global_min_dists.loc[subset_poi_ids].values
                    improvement_mask = distances_meters < current_bests
                    if improvement_mask.any():
                        improved_poi_ids = subset_poi_ids[improvement_mask]
                        improved_dists = distances_meters[improvement_mask]
                        global_min_dists.loc[improved_poi_ids] = improved_dists

                logger.info(f"Processed {filepath.name} - updated {len(poi_ids)} POIs")
            except Exception as e:
                # Best effort: one bad tile must not abort the whole run.
                logger.error(f"Failed to process {filepath.name}: {str(e)}")

        # Replace inf with NaN at the callsite (so callers can choose how to represent missing)
        return NearestBuildingsResult(distances_m=global_min_dists)
count_buildings_in_zones(*, handler, building_files, zones_gdf, source_filter, logger=None) classmethod

Count buildings intersecting each zone.

Parameters:

Name Type Description Default
handler

The building data handler (e.g. MSBuildingsHandler).

required
building_files Sequence[Path]

Sequence of building file paths.

required
zones_gdf GeoDataFrame

GeoDataFrame with 'zone_id' and 'geometry'.

required
source_filter SourceFilter

Filter for specific building sources ('google' or 'microsoft').

required
logger

Optional logger.

None

Returns:

Type Description
BuildingCountsResult

BuildingCountsResult containing the per-zone counts.

Source code in gigaspatial/processing/buildings_engine.py
@classmethod
def count_buildings_in_zones(
    cls,
    *,
    handler,
    building_files: Sequence[Path],
    zones_gdf: gpd.GeoDataFrame,
    source_filter: SourceFilter,
    logger=None,
) -> BuildingCountsResult:
    """
    Tally, for every zone, how many buildings intersect it.

    A single building file is scanned against all zones. With any other
    number of files they are treated as S2 tiles: only tiles matched to at
    least one zone are read, and per-tile counts are summed. A file that
    fails is logged and skipped.

    Args:
        handler: The building data handler (e.g. MSBuildingsHandler).
        building_files: Sequence of building file paths.
        zones_gdf: GeoDataFrame with 'zone_id' and 'geometry'.
        source_filter: Filter for specific building sources ('google' or 'microsoft');
            ``None`` keeps every building.
        logger: Optional logger; the module-level LOGGER is used when omitted.

    Returns:
        BuildingCountsResult whose ``counts`` Series is indexed by ``zone_id``.
    """
    from shapely.strtree import STRtree

    logger = logger or LOGGER

    totals = pd.Series(0, index=zones_gdf.zone_id, dtype=int)

    # Work out which (file, zone ids) pairs have to be scanned.
    single_file = len(building_files) == 1
    if single_file:
        jobs_list = [(building_files[0], zones_gdf.zone_id.values)]
    else:
        jobs_list = list(
            cls.create_partitioned_jobs_for_zones(zones_gdf, building_files)
        )

    if len(building_files) > 1 and len(jobs_list) == 0:
        # Tiled input, but none of the tiles touches a zone.
        return BuildingCountsResult(counts=totals)

    logger.info(f"Processing {len(jobs_list)} building file(s)...")

    # The source column is only needed when filtering on it.
    if source_filter is None:
        wanted_columns = ["geometry"]
    else:
        wanted_columns = ["bf_source", "geometry"]

    for filepath, zone_ids in jobs_list:
        try:
            buildings = handler.reader.load(filepath, columns=wanted_columns)

            if source_filter is not None and len(buildings) > 0:
                source_matches = buildings["bf_source"] == source_filter
                buildings = buildings.loc[source_matches]

            if len(buildings) == 0:
                logger.debug(f"No buildings in {filepath.name} after filtering")
                continue

            # Fresh positional index so query hits map onto bincount slots.
            in_this_job = zones_gdf.zone_id.isin(zone_ids)
            subset_zones = zones_gdf.loc[in_this_job].copy().reset_index(drop=True)

            spatial_index = STRtree(buildings.geometry)
            hit_zone_rows, _ = spatial_index.query(
                subset_zones.geometry, predicate="intersects"
            )
            per_zone = np.bincount(hit_zone_rows, minlength=len(subset_zones))

            touched_ids = subset_zones.zone_id.values
            if single_file:
                totals.loc[touched_ids] = per_zone
            else:
                # Zones may span several tiles, so add rather than overwrite.
                totals.loc[touched_ids] += per_zone

            updated_zones = int((per_zone > 0).sum())
            logger.info(
                f"Processed {filepath.name} - {updated_zones}/{len(subset_zones)} zones have buildings"
            )
        except Exception as e:
            logger.error(f"Failed to process {filepath.name}: {str(e)}")

    return BuildingCountsResult(counts=totals)
create_partitioned_jobs_for_pois(pois_gdf, building_files, *, search_radius_m, predicate='intersects') classmethod

Create job list for partitioned building data by buffering POIs.

Parameters:

Name Type Description Default
pois_gdf GeoDataFrame

GeoDataFrame of Points of Interest.

required
building_files Sequence[Path]

Sequence of building file paths.

required
search_radius_m float

Buffer radius in meters.

required
predicate Literal['intersects']

Spatial predicate for matching.

'intersects'

Returns:

Type Description
List[Tuple[Path, ndarray]]

List of (filepath, poi_id_array) tuples.

Source code in gigaspatial/processing/buildings_engine.py
@classmethod
def create_partitioned_jobs_for_pois(
    cls,
    pois_gdf: gpd.GeoDataFrame,
    building_files: Sequence[Path],
    *,
    search_radius_m: float,
    predicate: Literal["intersects"] = "intersects",
) -> List[Tuple[Path, np.ndarray]]:
    """
    Pair each S2-tiled building file with the POIs whose buffer matches its tile.

    POIs are first buffered by ``search_radius_m`` through
    ``buffer_geodataframe``; the buffers are then spatially joined against
    the tile grid derived from the building file names.

    Args:
        pois_gdf: GeoDataFrame of Points of Interest (needs a ``poi_id`` column).
        building_files: Sequence of building file paths.
        search_radius_m: Buffer radius in meters.
        predicate: Spatial predicate for matching.

    Returns:
        List of (filepath, poi_id_array) tuples; empty if nothing matches.
    """
    tiles = cls._s2_grid_gdf_from_building_files(building_files)
    search_areas = buffer_geodataframe(
        pois_gdf, buffer_distance_meters=search_radius_m
    )

    # Inner join keeps only (POI buffer, tile) pairs satisfying the predicate.
    matches = gpd.sjoin(search_areas, tiles, how="inner", predicate=predicate)
    if not len(matches):
        return []

    return [
        (tile_path, rows.poi_id.values)
        for tile_path, rows in matches.groupby("filepath")
    ]
create_partitioned_jobs_for_zones(zones_gdf, building_files, *, predicate='intersects') classmethod

Create job list for partitioned building data by finding intersecting S2 cells.

Parameters:

Name Type Description Default
zones_gdf GeoDataFrame

GeoDataFrame of administrative or grid zones.

required
building_files Sequence[Path]

Sequence of building file paths.

required
predicate Literal['intersects']

Spatial predicate for matching.

'intersects'

Returns:

Type Description
List[Tuple[Path, ndarray]]

List of (filepath, zone_id_array) tuples.

Source code in gigaspatial/processing/buildings_engine.py
@classmethod
def create_partitioned_jobs_for_zones(
    cls,
    zones_gdf: gpd.GeoDataFrame,
    building_files: Sequence[Path],
    *,
    predicate: Literal["intersects"] = "intersects",
) -> List[Tuple[Path, np.ndarray]]:
    """
    Pair each S2-tiled building file with the zones matched to its tile.

    Zones are spatially joined against the tile grid derived from the
    building file names; every tile with at least one match becomes a job.

    Args:
        zones_gdf: GeoDataFrame of administrative or grid zones (needs a
            ``zone_id`` column).
        building_files: Sequence of building file paths.
        predicate: Spatial predicate for matching.

    Returns:
        List of (filepath, zone_id_array) tuples; empty if nothing matches.
    """
    tiles = cls._s2_grid_gdf_from_building_files(building_files)

    # Inner join keeps only (zone, tile) pairs satisfying the predicate.
    matches = gpd.sjoin(zones_gdf, tiles, how="inner", predicate=predicate)
    if not len(matches):
        return []

    return [
        (tile_path, rows.zone_id.values)
        for tile_path, rows in matches.groupby("filepath")
    ]
nearest_buildings_to_pois(*, handler, building_files, pois_gdf, source_filter, search_radius_m, logger=None) classmethod

Find the nearest building distance (meters) per POI.

Parameters:

Name Type Description Default
handler

The building data handler.

required
building_files Sequence[Path]

Sequence of building file paths.

required
pois_gdf GeoDataFrame

GeoDataFrame with 'poi_id' and 'geometry'.

required
source_filter SourceFilter

Filter for building sources.

required
search_radius_m float

Search distance for partitioned optimization.

required
logger

Optional logger.

None

Returns:

Type Description
NearestBuildingsResult

NearestBuildingsResult containing distances in meters.

Source code in gigaspatial/processing/buildings_engine.py
@classmethod
def nearest_buildings_to_pois(
    cls,
    *,
    handler,
    building_files: Sequence[Path],
    pois_gdf: gpd.GeoDataFrame,
    source_filter: SourceFilter,
    search_radius_m: float,
    logger=None,
) -> NearestBuildingsResult:
    """
    Find the nearest building distance (meters) per POI.

    For every processed file the nearest building centroid to each POI is
    picked with a KD-tree on the raw x/y coordinates, and the distance to it
    is then computed with ``calculate_distance``. Across files the smallest
    distance per POI is kept. A file that fails to load or process is logged
    and skipped.

    Args:
        handler: The building data handler.
        building_files: Sequence of building file paths.
        pois_gdf: GeoDataFrame with 'poi_id' and 'geometry'. Geometries are
            read through ``.x`` / ``.y``, so they must be points.
        source_filter: Filter for building sources. ``None`` disables
            filtering.
        search_radius_m: Search distance for partitioned optimization. It
            only selects which tiles are read for a POI; returned distances
            are not capped by it.
        logger: Optional logger. Falls back to the module-level LOGGER.

    Returns:
        NearestBuildingsResult containing distances in meters, indexed by
        ``poi_id``. POIs for which no building was found keep ``np.inf``.
    """
    from scipy.spatial import cKDTree

    logger = logger or LOGGER

    global_min_dists = pd.Series(np.inf, index=pois_gdf.poi_id, dtype=float)

    if len(building_files) == 1:
        jobs: List[Tuple[Path, np.ndarray, bool]] = [
            (building_files[0], pois_gdf.poi_id.values, True)
        ]
    else:
        jobs_list = cls.create_partitioned_jobs_for_pois(
            pois_gdf,
            building_files,
            search_radius_m=search_radius_m,
        )
        jobs = [(fp, poi_ids, False) for fp, poi_ids in jobs_list]

    if len(building_files) > 1 and len(jobs) == 0:
        return NearestBuildingsResult(distances_m=global_min_dists)

    logger.info(f"Processing {len(jobs)} building file(s)...")

    # The source column is only read when it is needed for filtering.
    columns_to_read = (
        ["geometry"] if source_filter is None else ["bf_source", "geometry"]
    )

    for filepath, poi_ids, is_single_file in jobs:
        try:
            buildings = handler.reader.load(filepath, columns=columns_to_read)

            if source_filter is not None and len(buildings) > 0:
                buildings = buildings.loc[buildings["bf_source"] == source_filter]

            if len(buildings) == 0:
                logger.debug(f"No buildings in {filepath.name} after filtering")
                continue

            subset_pois = pois_gdf.loc[pois_gdf.poi_id.isin(poi_ids)]
            if len(subset_pois) == 0:
                continue

            # Distances below come out in subset_pois row order, so they are
            # written back with ids taken from those same rows. The job's
            # poi_ids array comes from a spatial join and is not guaranteed
            # to share that order.
            subset_poi_ids = subset_pois.poi_id.values

            poi_coords = np.vstack(
                (subset_pois.geometry.x, subset_pois.geometry.y)
            ).T

            b_centroids = buildings.geometry.centroid
            b_coords = np.vstack((b_centroids.x, b_centroids.y)).T
            if len(b_coords) == 0:
                continue

            # Nearest neighbour is chosen in raw coordinate space; the metric
            # distance is computed afterwards for that pair only.
            tree = cKDTree(b_coords)
            _, building_idxs = tree.query(poi_coords, k=1)

            nearest_b_coords = b_coords[building_idxs]
            # x is passed as longitude and y as latitude.
            distances_meters = calculate_distance(
                lat1=poi_coords[:, 1],
                lon1=poi_coords[:, 0],
                lat2=nearest_b_coords[:, 1],
                lon2=nearest_b_coords[:, 0],
            )

            if is_single_file:
                global_min_dists.loc[subset_poi_ids] = distances_meters
            else:
                # A POI can appear in several tiles; keep the minimum.
                current_bests = global_min_dists.loc[subset_poi_ids].values
                improvement_mask = distances_meters < current_bests
                if improvement_mask.any():
                    improved_poi_ids = subset_poi_ids[improvement_mask]
                    improved_dists = distances_meters[improvement_mask]
                    global_min_dists.loc[improved_poi_ids] = improved_dists

            logger.info(f"Processed {filepath.name} - updated {len(poi_ids)} POIs")
        except Exception as e:
            # Best effort: one bad tile must not abort the whole run.
            logger.error(f"Failed to process {filepath.name}: {str(e)}")

    # Replace inf with NaN at the callsite (so callers can choose how to represent missing)
    return NearestBuildingsResult(distances_m=global_min_dists)

NearestBuildingsResult dataclass

Result of nearest-building search for POIs.

Source code in gigaspatial/processing/buildings_engine.py
@dataclass(frozen=True)
class NearestBuildingsResult:
    """Immutable container for the outcome of a nearest-building search over POIs."""

    # Distance in meters from each POI to its nearest building. The engine
    # builds this as a float Series indexed by ``poi_id``, initialised to
    # ``np.inf`` so POIs with no building found stay at infinity.
    distances_m: pd.Series

entity_processor

EntityProcessor

Cleans and normalises raw entity DataFrames before Pydantic validation.

Designed to operate at the Silver layer of a medallion architecture — applied after manual Bronze inspection and before Gold schema validation via EntityTable.from_file or EntityTable.from_dataframe.

Pipeline (executed in order by process):

.. code-block:: text

1.  Lowercase + strip column names
2.  Rename columns        (static aliases + coordinate auto-detection)
3.  Strip string values   (NFKC-normalised, geometry-safe)
4.  Coerce null sentinels (geometry-safe)
5.  Repair coordinates    (trailing commas, merged lat/lon)
6.  Coerce numeric cols   (pd.to_numeric, errors → NaN)
7.  Normalise enum casing (LOWERCASE_COLUMNS)
8.  Parse geometry        (WKT / WKB / Shapely pass-through)
9.  Drop all-null rows
10. Deduplicate           (geometry-safe, excludes Shapely cols)
11. Boundary filter       (optional, requires ``country`` kwarg)
12. Admin region annotation (optional, requires ``country`` kwarg)

Class attributes (override in subclasses):

- LOWERCASE_COLUMNS: Columns to force-lowercase for enum matching.
- NUMERIC_COLUMNS: Columns to coerce with pd.to_numeric.
- COLUMN_ALIASES: Static non-coordinate column renames applied before coordinate auto-detection.
- verbose: If False, INFO-level pipeline step logs are suppressed; WARNING and ERROR always surface. Defaults to True.

Silver-layer utilities (call manually, not part of process):

- filter_by_country_boundary — clip rows to a country polygon
- validate_coordinates — flag or drop invalid lat/lon rows
- deduplicate_by_proximity — spatial dedup via KDTree
- assign_entity_id — generate deterministic UUID3 identifiers

Example::

processor = CellTowerProcessor()
processor.verbose = False          # suppress pipeline INFO logs

df = pd.read_csv("bronze/ke/towers.csv")
df = processor.process(df, country="KEN")
tower_table = CellTowerTable.from_dataframe(df)
Source code in gigaspatial/processing/entity_processor.py
  32
  33
  34
  35
  36
  37
  38
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
class EntityProcessor:
    """
    Cleans and normalises raw entity DataFrames before Pydantic validation.

    Designed to operate at the Silver layer of a medallion architecture —
    applied after manual Bronze inspection and before Gold schema validation
    via ``EntityTable.from_file`` or ``EntityTable.from_dataframe``.

    Pipeline (executed in order by ``process``):

    .. code-block:: text

        1.  Lowercase + strip column names
        2.  Rename columns        (static aliases + coordinate auto-detection)
        3.  Strip string values   (NFKC-normalised, geometry-safe)
        4.  Coerce null sentinels (geometry-safe)
        5.  Repair coordinates    (trailing commas, merged lat/lon)
        6.  Coerce numeric cols   (pd.to_numeric, errors → NaN)
        7.  Normalise enum casing (LOWERCASE_COLUMNS)
        8.  Parse geometry        (WKT / WKB / Shapely pass-through)
        9.  Drop all-null rows
        10. Deduplicate           (geometry-safe, excludes Shapely cols)
        11. Boundary filter       (optional, requires ``country`` kwarg)
        12. Admin region annotation (optional, requires ``country`` kwarg)

    Class attributes (override in subclasses):
        LOWERCASE_COLUMNS: Columns to force-lowercase for enum matching.
        NUMERIC_COLUMNS:   Columns to coerce with ``pd.to_numeric``.
        COLUMN_ALIASES:    Static non-coordinate column renames applied
                           before coordinate auto-detection.
        verbose:           If ``False``, INFO-level pipeline step logs are
                           suppressed. WARNING and ERROR always surface.
                           Defaults to ``True``.

    Silver-layer utilities (call manually, not part of ``process``):
        filter_by_country_boundary  — clip rows to a country polygon
        validate_coordinates        — flag or drop invalid lat/lon rows
        deduplicate_by_proximity    — spatial dedup via KDTree
        assign_entity_id            — generate deterministic UUID3 identifiers

    Example::

        processor = CellTowerProcessor()
        processor.verbose = False          # suppress pipeline INFO logs

        df = pd.read_csv("bronze/ke/towers.csv")
        df = processor.process(df, country="KEN")
        tower_table = CellTowerTable.from_dataframe(df)
    """

    LOWERCASE_COLUMNS: ClassVar[List[str]] = []
    NUMERIC_COLUMNS: ClassVar[List[str]] = ["latitude", "longitude"]
    COLUMN_ALIASES: ClassVar[Dict[str, str]] = {}
    verbose: ClassVar[bool] = True

    def __init__(self, verbose: Optional[bool] = None):
        """
        Initialise the processor.

        Args:
            verbose: Override the class-level ``verbose`` setting.
        """
        if verbose is not None:
            self.verbose = verbose
        self.processing_logs: List[str] = []

    # ------------------------------------------------------------------
    # Internal logging helper
    # ------------------------------------------------------------------

    def _log(self, level: str, msg: str, *args) -> None:
        """
        Emit a log message, respecting the ``verbose`` flag.

        INFO-level messages are suppressed when ``verbose=False``.
        WARNING and ERROR messages are always emitted.

        Args:
            level: One of ``'debug'``, ``'info'``, ``'warning'``, ``'error'``.
            msg: Printf-style message string.
            *args: Arguments for printf-style message formatting.
        """
        if not self.verbose and level == "info":
            return

        formatted_msg = msg % args if args else msg
        if hasattr(self, "processing_logs"):
            self.processing_logs.append(f"[{level.upper()}] {formatted_msg}")

        getattr(logger, level)(msg, *args)

    def track_changes(func):
        """
        Decorator for EntityProcessor methods to automatically log shape changes.

        Calculates the difference in rows and columns before and after the
        decorated method runs, appending a summary to ``self.processing_logs``.
        """

        @wraps(func)
        def wrapper(self, df: pd.DataFrame, *args, **kwargs):
            if not hasattr(self, "processing_logs"):
                return func(self, df, *args, **kwargs)

            initial_rows, initial_cols = df.shape
            initial_col_names = set(df.columns)

            result_df = func(self, df, *args, **kwargs)

            final_rows, final_cols = result_df.shape
            final_col_names = set(result_df.columns)

            dropped_rows = initial_rows - final_rows
            added_cols = final_col_names - initial_col_names
            dropped_cols = initial_col_names - final_col_names

            changes = []
            if dropped_rows > 0:
                changes.append(f"dropped {dropped_rows} rows")
            elif dropped_rows < 0:
                changes.append(f"added {abs(dropped_rows)} rows")

            if added_cols:
                changes.append(f"added columns {sorted(list(added_cols))}")
            if dropped_cols:
                changes.append(f"dropped columns {sorted(list(dropped_cols))}")

            if changes:
                msg = f"STEP [{func.__name__.strip('_')}]: " + ", ".join(changes)
                self.processing_logs.append(msg)

            return result_df

        return wrapper

    # ------------------------------------------------------------------
    # Public entry point
    # ------------------------------------------------------------------

    def process(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
        """
        Run the full cleaning pipeline on a raw DataFrame.

        Steps run in a fixed order so that each one sees the output it
        expects from the previous one:

        1.  Strip + lowercase column names
        2.  Rename columns (static aliases, coordinate auto-detection)
        3.  Strip / normalise string values
        4.  Coerce null-like sentinels
        5.  Repair coordinate columns
        6.  Coerce numeric columns
        7.  Normalise casing of enum-like columns
        8.  Parse geometry
        9.  Drop all-null rows
        10. Drop duplicates
        11. Boundary filter (only when ``country`` is given)
        12. Admin-region annotation (only when ``country`` is given)
        13. Replace ``NaN`` with ``None``

        Args:
            df: Raw input data. The caller's frame is not modified; the
                pipeline works on a copy.
            **kwargs:
                country (str): Optional country identifier. When truthy,
                    rows are filtered with ``_filter_by_country_boundary``
                    and then annotated with
                    ``_annotate_with_admin_regions``.
                All remaining kwargs are forwarded to both of those methods.

        Returns:
            The cleaned frame, with ``NaN`` values replaced by ``None``.
        """
        self.processing_logs = []  # Clear logs at start of process
        df = df.copy()

        # --- structural ---
        # Header normalisation must come first: ``_rename_columns`` matches
        # aliases and the canonical 'latitude' / 'longitude' names against
        # already-lowercased headers. Non-string labels are left untouched
        # instead of raising on ``.strip()``.
        df.columns = [
            c.strip().lower() if isinstance(c, str) else c for c in df.columns
        ]
        df = self._rename_columns(df)

        # --- string normalisation (geometry-safe, before numeric coercion) ---
        df = self._strip_strings(df)

        # --- null coercion (geometry-safe, on stripped values) ---
        df = self._coerce_nulls(df)

        # --- coordinate repair (must precede numeric coercion) ---
        df = self._repair_coordinate_columns(df)

        # --- numeric coercion ---
        df = self._coerce_numeric_columns(df)

        # --- casing normalisation ---
        df = self._normalize_casing(df)

        # --- geometry parsing ---
        df = self._parse_geometry(df)

        # --- row cleanup ---
        df = self._drop_empty_rows(df)
        df = self._drop_duplicates(df)

        # --- enrichment ---
        # ``country`` is popped so it is not passed twice to the helpers.
        country = kwargs.pop("country", None)
        if country:
            df = self._filter_by_country_boundary(df, boundary=country, **kwargs)
            df = self._annotate_with_admin_regions(df, country, **kwargs)

        # --- final normalization for Pydantic (NaN -> None) ---
        # This allows optional numeric fields with missing values to pass validation.
        df = df.replace({np.nan: None})

        return df

    # ------------------------------------------------------------------
    # Geometry column detection
    # ------------------------------------------------------------------

    @staticmethod
    def _get_geometry_cols(df: pd.DataFrame) -> set[str]:
        """
        Collect the names of columns that hold Shapely geometry objects.

        Two sources are combined:

        1. For a ``GeoDataFrame``, the name of its active geometry column,
           stripped and lowercased.
        2. Any other ``object``-dtype column whose first non-null value is
           a ``BaseGeometry`` instance. These names are added as they
           appear in ``df``.

        Only the first non-null value of each column is inspected, so a
        column that mixes geometries with other objects is classified by
        that single sample.

        Args:
            df: DataFrame or GeoDataFrame to inspect.

        Returns:
            Set of column names identified as geometry columns.
        """
        from shapely.geometry.base import BaseGeometry

        detected: set[str] = set()

        # Pass 1: the declared geometry column of a GeoDataFrame.
        if isinstance(df, gpd.GeoDataFrame):
            declared = df.geometry.name.strip().lower()
            detected.add(declared)

        # Pass 2: ad-hoc object columns, judged by their first non-null value.
        for name in df.select_dtypes(include="object").columns:
            if name in detected:
                continue
            sample = df[name].dropna().iloc[:1]
            if sample.empty:
                continue
            if isinstance(sample.iloc[0], BaseGeometry):
                detected.add(name)

        return detected

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------

    @track_changes
    def _rename_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Rename columns to canonical names in two passes.

        Pass 1 — static aliases:
            Applies ``COLUMN_ALIASES``. A source column is renamed only when
            its target name is still free: not an existing column and not
            already claimed by an earlier alias. When several aliases point
            to the same target, the first one present in ``df`` wins and
            the rest keep their original names, so the rename never creates
            duplicate column labels.

        Pass 2 — coordinate auto-detection:
            When ``'latitude'`` / ``'longitude'`` are neither present nor
            claimed by an alias, ``detect_coordinate_columns`` is asked for
            candidates and they are renamed to the canonical names. A
            ``ValueError`` from the detector is logged at DEBUG level and
            otherwise ignored.

        Args:
            df: DataFrame whose columns should be renamed.

        Returns:
            DataFrame with columns renamed to canonical names.
        """
        self._log("info", "Renaming columns.")
        rename_map: Dict[str, str] = {}

        # Names that must not be produced by a rename: everything already in
        # the frame plus every target claimed so far.
        taken = set(df.columns)

        for src, tgt in self.COLUMN_ALIASES.items():
            if src not in df.columns:
                continue
            if tgt in taken:
                if tgt not in df.columns:
                    logger.debug(
                        "Alias '%s' -> '%s' skipped: target already claimed.",
                        src,
                        tgt,
                    )
                continue
            rename_map[src] = tgt
            taken.add(tgt)

        # A canonical name counts as present when an alias is about to
        # produce it; otherwise detection could map a second column onto it.
        has_lat = "latitude" in taken
        has_lon = "longitude" in taken
        if not (has_lat and has_lon):
            try:
                lat_col, lon_col = detect_coordinate_columns(df)
            except ValueError as exc:
                logger.debug("Coordinate column detection skipped: %s", exc)
            else:
                if not has_lat and lat_col != "latitude":
                    rename_map[lat_col] = "latitude"
                if not has_lon and lon_col != "longitude":
                    rename_map[lon_col] = "longitude"

        if rename_map:
            logger.debug("Renaming columns: %s", rename_map)
            df = df.rename(columns=rename_map)

        return df

    @track_changes
    def _strip_strings(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Apply Unicode NFKC normalisation and ``str.strip`` to string cells.

        Only ``object``-dtype columns that ``_get_geometry_cols`` does not
        report as geometry are touched. Within those columns, non-string
        values are passed through unchanged. NFKC folds compatibility
        characters (for example non-breaking spaces and fullwidth digits)
        into their plain equivalents before the surrounding whitespace is
        removed.

        Args:
            df: DataFrame to clean; modified in place and returned.

        Returns:
            The same DataFrame with string values normalised and stripped.
        """
        self._log("info", "Stripping whitespace from string columns.")
        skip = self._get_geometry_cols(df)
        targets = [
            name
            for name in df.select_dtypes(include="object").columns
            if name not in skip
        ]

        if targets:

            def _clean(value):
                # Leave numbers, None, etc. exactly as they are.
                if isinstance(value, str):
                    return unicodedata.normalize("NFKC", value).strip()
                return value

            df[targets] = df[targets].apply(lambda series: series.apply(_clean))

        return df

    @track_changes
    def _coerce_nulls(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Turn null-like sentinel values into ``None`` in non-geometry columns.

        Every value listed in the module-level ``NULL_LIKE_VALUES`` is
        replaced. Columns reported by ``_get_geometry_cols`` are left out,
        so geometry objects are never passed through ``DataFrame.replace``.

        Args:
            df: DataFrame to clean; modified in place and returned.

        Returns:
            The same DataFrame with null-like values replaced by ``None``.
        """
        self._log("info", "Coercing null-like values to None.")
        excluded = self._get_geometry_cols(df)
        targets = [name for name in df.columns if name not in excluded]

        cleaned = df[targets].replace(NULL_LIKE_VALUES, None)
        df[targets] = cleaned
        return df

    @track_changes
    def _repair_coordinate_columns(
        self,
        df: pd.DataFrame,
        lat_col: str = "latitude",
        lon_col: str = "longitude",
    ) -> pd.DataFrame:
        """
        Repair common coordinate encoding errors before numeric coercion.

        Handles four cases found in real-world Bronze-layer data:

        - Whitespace padding:  ``'-16.5275 '``  → ``-16.5275``
        - Trailing comma:      ``'-13.457503,'`` → ``-13.457503``
        - Comma-merged pair:   ``'-13.457503, 25.326'`` →
          latitude ``-13.457503``, longitude ``25.326``
        - Space-merged pair:   ``'-12.325 25.326'`` →
          latitude ``-12.325``, longitude ``25.326``

        For merged pairs, the extracted longitude fills missing values in
        the longitude column only; existing non-null longitude values are
        never overwritten.

        Returns the DataFrame unchanged if ``lat_col`` is absent.

        Args:
            df: DataFrame after null coercion.
            lat_col: Name of the latitude column. Defaults to ``'latitude'``.
            lon_col: Name of the longitude column. Defaults to ``'longitude'``.

        Returns:
            DataFrame with coordinate columns repaired.
        """
        self._log("info", "Repairing coordinate columns.")
        if lat_col not in df.columns:
            return df

        def split_merged(v):
            if not isinstance(v, str):
                return v, None
            normalized = v.strip().replace(", ", " ").replace(",", " ")
            parts = normalized.split()
            if len(parts) == 2:
                try:
                    return float(parts[0]), float(parts[1])
                except ValueError:
                    pass
            try:
                return float(v.strip().strip(",").strip()), None
            except ValueError:
                return v, None

        split_results = df[lat_col].apply(split_merged)
        extracted_lats = split_results.apply(lambda x: x[0])
        extracted_lons = split_results.apply(lambda x: x[1])
        df[lat_col] = extracted_lats

        if lon_col in df.columns:
            lon_fill_mask = df[lon_col].isna() & extracted_lons.notna()
            if lon_fill_mask.any():
                df.loc[lon_fill_mask, lon_col] = extracted_lons[lon_fill_mask]
                self._log(
                    "info",
                    "%d longitude values recovered from merged lat/lon column.",
                    lon_fill_mask.sum(),
                )
        else:
            df[lon_col] = extracted_lons

        return df

    @track_changes
    def _coerce_numeric_columns(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Coerce ``NUMERIC_COLUMNS`` to float via ``pd.to_numeric``.

        Values that cannot be parsed are silently set to ``NaN`` rather than
        raising, consistent with the soft-fail philosophy of the pipeline.
        Columns absent from ``df`` are skipped without warning.

        Args:
            df: DataFrame after coordinate repair.

        Returns:
            DataFrame with listed columns cast to float where possible.
        """
        self._log("info", "Coercing numeric columns: %s", self.NUMERIC_COLUMNS)
        for col in self.NUMERIC_COLUMNS:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")
        return df

    @track_changes
    def _normalize_casing(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Lowercase values in ``LOWERCASE_COLUMNS`` for consistent enum matching.

        Only applied to columns with ``object`` dtype; columns that have
        already been coerced to a numeric type are silently skipped.

        Args:
            df: DataFrame after numeric coercion.

        Returns:
            DataFrame with target columns lowercased.
        """
        self._log("info", "Normalising casing for columns: %s", self.LOWERCASE_COLUMNS)
        for col in self.LOWERCASE_COLUMNS:
            if col in df.columns and df[col].dtype == object:
                df[col] = df[col].str.lower()
        return df

    @track_changes
    def _parse_geometry(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Convert the ``geometry`` column to Shapely objects.

        Per value:

        - ``None`` and existing ``BaseGeometry`` instances pass through.
        - ``str`` is parsed as WKT, ``bytes`` / ``bytearray`` as WKB.
        - Anything else, and anything that fails to parse, becomes ``None``
          (parse failures are logged at DEBUG level).

        No rows are dropped. If any value ends up null, a single WARNING
        reports how many. The method does nothing when ``df`` has no
        ``geometry`` column.

        Args:
            df: DataFrame to parse; modified in place and returned.

        Returns:
            The same DataFrame with ``geometry`` holding Shapely objects or
            ``None``.
        """
        self._log("info", "Parsing geometry column.")
        if "geometry" not in df.columns:
            logger.debug("No 'geometry' column found; skipping.")
            return df

        from shapely import wkt, wkb
        from shapely.geometry.base import BaseGeometry

        def _to_geometry(raw):
            # Already usable (or already missing): nothing to do.
            if raw is None or isinstance(raw, BaseGeometry):
                return raw
            try:
                if isinstance(raw, str):
                    return wkt.loads(raw)
                if isinstance(raw, (bytes, bytearray)):
                    return wkb.loads(raw)
            except Exception as exc:
                logger.debug("Failed to parse geometry value %r: %s", raw, exc)
            # Unsupported type or failed parse.
            return None

        df["geometry"] = df["geometry"].apply(_to_geometry)

        missing = df["geometry"].isna().sum()
        if missing:
            logger.warning("%d rows have unparseable or missing geometry.", missing)
        return df

    @track_changes
    def _drop_empty_rows(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Drop rows where every value is ``None`` / ``NaN``.

        Applied after all coercion steps so that rows reduced to all-null
        by cleaning (e.g. a row containing only ``"N/A"`` values) are removed
        before deduplication and validation.

        Args:
            df: DataFrame after geometry parsing.

        Returns:
            DataFrame with all-null rows removed.
        """
        self._log("info", "Dropping all-null rows.")
        return df.dropna(how="all")

    @track_changes
    def _drop_duplicates(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Remove duplicate rows, comparing only columns that are safe to hash.

        A column takes part in the comparison unless

        - ``_get_geometry_cols`` reports it as a geometry column, or
        - it has ``object`` dtype and its first non-null value raises
          ``TypeError`` when hashed (e.g. lists, dicts, sets).

        Only that first non-null value is probed. When no column
        qualifies, the frame is returned untouched.

        Args:
            df: DataFrame to deduplicate.

        Returns:
            Deduplicated DataFrame (first occurrence of each row is kept).
        """
        self._log("info", "Dropping duplicate rows.")
        geometry_cols = self._get_geometry_cols(df)

        def _is_comparable(name) -> bool:
            """True when the column's sampled value can be hashed."""
            series = df[name]
            if series.dtype != object:
                return True
            sample = series.dropna().iloc[:1]
            if sample.empty:
                return True
            try:
                hash(sample.iloc[0])
            except TypeError:
                logger.debug(
                    "Excluding unhashable column '%s' from deduplication.", name
                )
                return False
            return True

        subset = [
            name
            for name in df.columns
            if name not in geometry_cols and _is_comparable(name)
        ]

        if subset:
            return df.drop_duplicates(subset=subset)

        logger.debug(
            "No comparable columns found for deduplication; returning unchanged."
        )
        return df

    def _normalize_enum_column(
        self,
        df: pd.DataFrame,
        column: str,
        alias_map: Dict[str, str],
        valid_values: Set[str],
        required: bool = False,
    ) -> pd.DataFrame:
        """
        Normalise an enum-typed column via an alias map, setting unrecognised
        values to ``None``.

        Alias resolution is applied first, then any value not present in
        ``valid_values`` and not already ``None`` is nulled and logged as a
        warning. Values that are already ``None`` / ``NaN`` are left unchanged.

        Args:
            df: DataFrame containing the column to normalise.
            column: Name of the column to normalise.
            alias_map: Mapping of raw string value → canonical enum value.
                Applied via ``Series.replace`` after casing normalisation.
            valid_values: Complete set of accepted canonical string values.
                Typically ``{e.value for e in SomeEnum}``.
            required: If ``True``, log a ``WARNING`` when the column is absent;
                otherwise log at ``DEBUG`` level. Use ``True`` for columns
                that map to required Pydantic fields. Defaults to ``False``.

        Returns:
            DataFrame with the column normalised in-place.
        """
        if column not in df.columns:
            if required:
                logger.warning("Column '%s' not found, skipping normalisation.", column)
            else:
                logger.debug("Column '%s' not found, skipping normalisation.", column)
            return df

        df[column] = df[column].replace(alias_map)

        invalid_mask = ~df[column].isin(valid_values) & df[column].notna()
        if invalid_mask.any():
            logger.warning(
                "%d rows have unrecognised '%s' values: %s — setting to None.",
                invalid_mask.sum(),
                column,
                df.loc[invalid_mask, column].unique().tolist(),
            )
            df.loc[invalid_mask, column] = None

        return df

    def _annotate_with_admin_regions(
        self,
        df: pd.DataFrame,
        country,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Add administrative-region columns through a spatial annotation.

        A frame that is not already a ``GeoDataFrame`` is first passed
        through ``convert_to_geodataframe``. The geo frame is handed to
        ``annotate_with_admin_regions`` and the column named ``geometry``
        is dropped from the annotated result before it is returned.

        Args:
            df: DataFrame or GeoDataFrame to annotate.
            country: Country identifier forwarded to
                ``annotate_with_admin_regions``.
            **kwargs: Extra keyword arguments forwarded to the annotator.

        Returns:
            The annotated frame without its ``geometry`` column.
        """
        self._log("info", "Annotating rows with administrative regions.")
        if isinstance(df, gpd.GeoDataFrame):
            geo_frame = df
        else:
            geo_frame = convert_to_geodataframe(df)
        annotated = annotate_with_admin_regions(geo_frame, country, **kwargs)
        return annotated.drop(columns="geometry")

    @track_changes
    def _filter_by_country_boundary(
        self,
        df: Union[pd.DataFrame, gpd.GeoDataFrame],
        boundary: Union[str, gpd.GeoDataFrame],
        lat_col: str = "latitude",
        lon_col: str = "longitude",
        **kwargs,
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Keep only the rows whose geometry intersects a boundary.

        The boundary geometries are dissolved into one shape and every row
        is tested against it with ``intersects``. Rows for which the test
        is false are removed, and the number removed is logged at INFO
        level.

        A ``GeoDataFrame`` input is tested on its own geometry column. A
        plain ``DataFrame`` is converted with ``convert_to_geodataframe``
        just to build the mask; the returned frame is the original plain
        DataFrame, filtered, without any temporary geometry.

        Args:
            df: Rows to filter. A plain DataFrame is converted using
                ``lat_col`` and ``lon_col``; a GeoDataFrame is used as-is.
            boundary: Either a string, which is passed to
                ``AdminBoundaries.create(boundary, admin_level=0, **kwargs)``
                to load the boundary, or a GeoDataFrame of boundary
                geometries.
            lat_col: Latitude column used for the conversion of a plain
                DataFrame. Defaults to ``'latitude'``.
            lon_col: Longitude column used for the conversion of a plain
                DataFrame. Defaults to ``'longitude'``.
            **kwargs: Forwarded to ``AdminBoundaries.create`` when
                ``boundary`` is a string.

        Returns:
            The filtered frame, of the same kind as the input, with its
            index reset.

        Raises:
            ValueError: If ``boundary`` is neither a ``str`` nor a
                ``gpd.GeoDataFrame``.
        """
        from gigaspatial.handlers.boundaries import AdminBoundaries

        # --- boundary as a GeoDataFrame ---
        if isinstance(boundary, gpd.GeoDataFrame):
            boundary_gdf = boundary
        elif isinstance(boundary, str):
            boundary_gdf = AdminBoundaries.create(
                boundary, admin_level=0, **kwargs
            ).to_geodataframe()
        else:
            raise ValueError(
                f"Unsupported boundary type: {type(boundary).__name__}. "
                "Expected a country string or GeoDataFrame."
            )

        # Collapse all boundary features into a single geometry.
        boundary_union = boundary_gdf.dissolve().geometry.iloc[0]

        # --- geometry to test ---
        if isinstance(df, gpd.GeoDataFrame):
            geo_frame = df
        else:
            geo_frame = convert_to_geodataframe(df, lat_col=lat_col, lon_col=lon_col)

        inside = geo_frame.geometry.intersects(boundary_union)

        outside_count = (~inside).sum()
        if outside_count:
            self._log("info", "%d rows removed outside the boundary.", outside_count)

        # Filtering ``df`` itself returns the input type in both cases: for a
        # GeoDataFrame ``geo_frame`` is ``df``, and for a plain DataFrame the
        # temporary geometry is never attached to the result.
        return df.loc[inside].reset_index(drop=True)

    def validate_coordinates(
        self,
        df: pd.DataFrame,
        lat_col: str = "latitude",
        lon_col: str = "longitude",
        drop_invalid: bool = True,
    ) -> pd.DataFrame:
        """
        Check latitude/longitude columns and drop or flag the bad rows.

        A row is invalid when at least one of these independent checks hits
        (each check emits its own warning with a row count):

        - **Missing**: either coordinate is ``None``/``NaN`` or cannot be
          parsed as a number.
        - **Out-of-range**: latitude outside ``[-90, 90]`` or longitude
          outside ``[-180, 180]``.
        - **Null island**: latitude and longitude are both exactly ``0.0``.

        Values are passed through ``pd.to_numeric(errors="coerce")`` first,
        so string-typed coordinate columns are accepted.

        Args:
            df: Input DataFrame containing coordinate columns.
            lat_col: Name of the latitude column. Defaults to ``'latitude'``.
            lon_col: Name of the longitude column. Defaults to ``'longitude'``.
            drop_invalid: If ``True`` (default), invalid rows are removed.
                If ``False``, nothing is removed and a boolean
                ``is_valid_coordinate`` column is appended to a copy.

        Returns:
            - ``drop_invalid=True``: the valid rows only, index reset.
            - ``drop_invalid=False``: a copy of ``df`` with an extra
              ``is_valid_coordinate`` boolean column.

        Raises:
            ValueError: If ``lat_col`` or ``lon_col`` are not present in ``df``.

        Example::

            # inspect before dropping
            df = processor.validate_coordinates(df, drop_invalid=False)
            print(df[~df["is_valid_coordinate"]][["latitude", "longitude"]])

            # drop once satisfied
            df = df[df["is_valid_coordinate"]].drop(columns="is_valid_coordinate")
        """
        absent = [name for name in (lat_col, lon_col) if name not in df.columns]
        if absent:
            raise ValueError(
                f"Coordinate column(s) not found in DataFrame: {absent}. "
                f"Available columns: {df.columns.tolist()}"
            )

        latitudes = pd.to_numeric(df[lat_col], errors="coerce")
        longitudes = pd.to_numeric(df[lon_col], errors="coerce")

        is_missing = latitudes.isna() | longitudes.isna()
        is_out_of_range = (
            (latitudes < -90)
            | (latitudes > 90)
            | (longitudes < -180)
            | (longitudes > 180)
        )
        is_null_island = (latitudes == 0.0) & (longitudes == 0.0)
        is_invalid = is_missing | is_out_of_range | is_null_island

        # One warning per failure class, emitted only when that class occurs.
        per_class_warnings = (
            (is_missing, "%d rows have missing coordinate values."),
            (
                is_out_of_range,
                "%d rows have out-of-range coordinate values "
                "(lat outside [-90, 90] or lon outside [-180, 180]).",
            ),
            (is_null_island, "%d rows have null island coordinates (0.0, 0.0)."),
        )
        for flagged, template in per_class_warnings:
            if flagged.any():
                logger.warning(template, flagged.sum())

        invalid_count = is_invalid.sum()

        if not drop_invalid:
            # Flag-only mode: always report, never remove.
            flagged_df = df.copy()
            flagged_df["is_valid_coordinate"] = ~is_invalid
            self._log(
                "info",
                "%d of %d rows flagged as invalid coordinates.",
                invalid_count,
                len(flagged_df),
            )
            return flagged_df

        if invalid_count:
            self._log(
                "info",
                "%d of %d rows removed due to invalid coordinates.",
                invalid_count,
                len(df),
            )
        return df.loc[~is_invalid].reset_index(drop=True)

    def deduplicate_by_proximity(
        self,
        df: pd.DataFrame,
        distance_threshold_m: float = 50,
        lat_col: str = "latitude",
        lon_col: str = "longitude",
        keep: str = "first",
    ) -> pd.DataFrame:
        """
        Drop rows lying within a distance threshold of an earlier kept row.

        Points are projected to an estimated UTM CRS and indexed in a
        KDTree.  Rows are then visited in order: every row still marked as
        kept survives, and all of its neighbours within
        ``distance_threshold_m`` are marked as duplicates.  With
        ``keep='last'`` the scan runs over the reversed frame and the
        original ordering is restored before returning.

        Args:
            df: Input DataFrame containing coordinate columns.
            distance_threshold_m: Radius passed to the KDTree neighbour
                query, in the units of the projected CRS (metres for UTM).
                Defaults to ``50``.
            lat_col: Name of the latitude column. Defaults to ``'latitude'``.
            lon_col: Name of the longitude column. Defaults to ``'longitude'``.
            keep: ``'first'`` (default) retains the earliest row of each
                cluster; ``'last'`` retains the latest.

        Returns:
            DataFrame with near-duplicate rows removed, index reset.

        Raises:
            ValueError: If ``distance_threshold_m`` is negative.
            ValueError: If ``keep`` is not ``'first'`` or ``'last'``.
            ValueError: If ``lat_col`` or ``lon_col`` are absent from ``df``.
        """
        if distance_threshold_m < 0:
            raise ValueError("distance_threshold_m must be non-negative.")
        if keep not in ("first", "last"):
            raise ValueError(f"keep must be 'first' or 'last', got '{keep}'.")

        absent = [name for name in (lat_col, lon_col) if name not in df.columns]
        if absent:
            raise ValueError(
                f"Coordinate column(s) not found in DataFrame: {absent}. "
                f"Available columns: {df.columns.tolist()}"
            )

        if df.empty:
            return df.reset_index(drop=True)

        # Scanning the reversed frame makes the "first seen wins" rule keep
        # the last occurrence instead.
        scan_reversed = keep == "last"
        ordered = df.iloc[::-1] if scan_reversed else df
        ordered = ordered.reset_index(drop=True)

        gdf = convert_to_geodataframe(ordered, lat_col=lat_col, lon_col=lon_col)

        from gigaspatial.processing.geo import estimate_utm_crs_with_fallback

        projected = gdf.to_crs(estimate_utm_crs_with_fallback(gdf))
        coords = projected.get_coordinates().to_numpy()

        tree = cKDTree(coords)
        is_kept = np.ones(len(coords), dtype=bool)
        for idx, point in enumerate(coords):
            if is_kept[idx]:
                # A surviving row eliminates every other point inside its radius.
                for neighbour in tree.query_ball_point(
                    point, r=distance_threshold_m
                ):
                    if neighbour != idx:
                        is_kept[neighbour] = False

        n_removed = (~is_kept).sum()
        if n_removed:
            self._log(
                "info",
                "%d of %d rows removed as near-duplicates "
                "(distance_threshold_m=%.1f).",
                n_removed,
                len(df),
                distance_threshold_m,
            )

        survivors = ordered.loc[is_kept].reset_index(drop=True)
        if scan_reversed:
            # Undo the reversal so output follows the caller's row order.
            survivors = survivors.iloc[::-1].reset_index(drop=True)
        return survivors

    def assign_entity_id(
        self,
        df: pd.DataFrame,
        entity_type: str,
        source_columns: Optional[List[str]] = None,
        overwrite: bool = False,
    ) -> pd.DataFrame:
        """
        Assign deterministic UUID3 identifiers to entities in a DataFrame.

        Generates identifiers derived from ``source_columns`` values using
        ``uuid.uuid3`` against ``ENTITY_UUID_NAMESPACE``, so the same
        combination of source values always yields the same ID.

        Rows where any ``source_columns`` value is missing fall back to a
        random ``uuid.uuid4`` so no row is ever left without an identifier.
        "Missing" covers ``None``, ``NaN``, ``NaT``, ``pd.NA`` and strings
        that are empty after stripping.  If ``source_columns`` is empty or
        ``None``, all rows receive UUID4.

        Existing IDs are preserved unless ``overwrite=True``.

        Args:
            df: Input DataFrame containing entity data. It is not mutated;
                a copy is returned.
            entity_type: Entity type name used to derive the ID column name
                (e.g. ``'cell_tower'`` → ``'cell_tower_id'``).
            source_columns: Column names whose stripped string values (joined
                with ``'|'``) seed the UUID3 hash.  Columns absent from
                ``df`` are dropped from the seed with a warning.  Defaults
                to ``None`` (all UUID4).
            overwrite: If ``True``, regenerate IDs for all rows, including
                those that already have a non-null ID.  Defaults to
                ``False``.

        Returns:
            DataFrame with a populated ``{entity_type}_id`` column.

        Raises:
            ValueError: If ``entity_type`` is empty or blank.

        Example::

            df = processor.assign_entity_id(
                df,
                entity_type="cell_tower",
                source_columns=["cell_tower_id_source", "country_iso"],
            )
        """
        if not entity_type or not entity_type.strip():
            raise ValueError("entity_type must not be empty.")

        df = df.copy()
        id_column = f"{entity_type}_id"
        source_columns = source_columns or []

        if id_column not in df.columns:
            df[id_column] = None

        missing_cols = [c for c in source_columns if c not in df.columns]
        if missing_cols:
            logger.warning(
                "Source columns %s not found in DataFrame. "
                "Affected rows will receive random UUID4 identifiers.",
                missing_cols,
            )
            source_columns = [c for c in source_columns if c in df.columns]

        mask = pd.Series(True, index=df.index) if overwrite else df[id_column].isna()

        if not mask.any():
            logger.debug("All rows already have IDs. Nothing to assign.")
            return df

        def build_seed(row: pd.Series) -> str:
            """Return the '|'-joined seed for ``row``, or '' if any value is unusable."""
            parts = []
            for col in source_columns:
                value = row[col]
                # NaN/NaT/pd.NA are not ``None``; without this check they
                # would be stringified ("nan") and hashed, giving every
                # such row the same UUID3.
                if pd.api.types.is_scalar(value) and pd.isna(value):
                    return ""
                text = str(value).strip()
                if not text:
                    return ""
                parts.append(text)
            return "|".join(parts)

        target_rows = df.loc[mask]
        if source_columns:
            # Row-wise apply is kept deliberately: the per-row value coercion
            # determines ``str(value)`` and therefore the resulting UUID3.
            seeds = target_rows.apply(build_seed, axis=1).tolist()
        else:
            seeds = [""] * len(target_rows)

        # A valid seed is never empty, so '' doubles as the "use UUID4" marker.
        df.loc[mask, id_column] = [
            str(uuid.uuid3(ENTITY_UUID_NAMESPACE, seed)) if seed else str(uuid.uuid4())
            for seed in seeds
        ]

        # Counts come from the seeds actually used, so the log always matches
        # the IDs that were generated.
        deterministic = sum(1 for seed in seeds if seed)
        fallback = len(seeds) - deterministic

        self._log(
            "info",
            "Assigned IDs for %d rows: %d deterministic (UUID3), %d fallback (UUID4).",
            len(seeds),
            deterministic,
            fallback,
        )
        return df
__init__(verbose=None)

Initialise the processor.

Parameters:

Name Type Description Default
verbose Optional[bool]

Override the class-level verbose setting.

None
Source code in gigaspatial/processing/entity_processor.py
def __init__(self, verbose: Optional[bool] = None):
    """
    Set up a processor instance with an empty processing log.

    Args:
        verbose: When not ``None``, stored on the instance as ``verbose``,
            taking precedence over the class-level setting.
    """
    self.processing_logs: List[str] = []
    if verbose is None:
        # Leave the class-level ``verbose`` in effect.
        return
    self.verbose = verbose
assign_entity_id(df, entity_type, source_columns=None, overwrite=False)

Assign deterministic UUID3 identifiers to entities in a DataFrame.

Generates identifiers derived from source_columns values using uuid.uuid3 against a fixed GigaSpatial-scoped namespace UUID, ensuring the same combination of source values always produces the same ID across runs and environments.

Rows where any source_columns value is missing fall back to a random uuid.uuid4 so no row is ever left without an identifier. If source_columns is empty or None, all rows receive UUID4.

Existing IDs are preserved unless overwrite=True.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame containing entity data.

required
entity_type str

Entity type name used to derive the ID column name (e.g. 'cell_tower' → 'cell_tower_id').

required
source_columns Optional[List[str]]

Column names whose concatenated values (joined with '|') seed the UUID3 hash. Columns absent from df are skipped with a warning and affected rows fall back to UUID4. Defaults to None (all UUID4).

None
overwrite bool

If True, regenerate IDs for all rows, including those that already have a non-null ID. Defaults to False.

False

Returns:

Type Description
DataFrame

DataFrame with a populated {entity_type}_id column.

Raises:

Type Description
ValueError

If entity_type is empty or blank.

df = processor.assign_entity_id(
    df,
    entity_type="cell_tower",
    source_columns=["cell_tower_id_source", "country_iso"],
)
Source code in gigaspatial/processing/entity_processor.py
def assign_entity_id(
    self,
    df: pd.DataFrame,
    entity_type: str,
    source_columns: Optional[List[str]] = None,
    overwrite: bool = False,
) -> pd.DataFrame:
    """
    Assign deterministic UUID3 identifiers to entities in a DataFrame.

    Generates identifiers derived from ``source_columns`` values using
    ``uuid.uuid3`` against ``ENTITY_UUID_NAMESPACE``, so the same
    combination of source values always yields the same ID.

    Rows where any ``source_columns`` value is missing fall back to a
    random ``uuid.uuid4`` so no row is ever left without an identifier.
    "Missing" covers ``None``, ``NaN``, ``NaT``, ``pd.NA`` and strings
    that are empty after stripping.  If ``source_columns`` is empty or
    ``None``, all rows receive UUID4.

    Existing IDs are preserved unless ``overwrite=True``.

    Args:
        df: Input DataFrame containing entity data. It is not mutated;
            a copy is returned.
        entity_type: Entity type name used to derive the ID column name
            (e.g. ``'cell_tower'`` → ``'cell_tower_id'``).
        source_columns: Column names whose stripped string values (joined
            with ``'|'``) seed the UUID3 hash.  Columns absent from
            ``df`` are dropped from the seed with a warning.  Defaults
            to ``None`` (all UUID4).
        overwrite: If ``True``, regenerate IDs for all rows, including
            those that already have a non-null ID.  Defaults to
            ``False``.

    Returns:
        DataFrame with a populated ``{entity_type}_id`` column.

    Raises:
        ValueError: If ``entity_type`` is empty or blank.

    Example::

        df = processor.assign_entity_id(
            df,
            entity_type="cell_tower",
            source_columns=["cell_tower_id_source", "country_iso"],
        )
    """
    if not entity_type or not entity_type.strip():
        raise ValueError("entity_type must not be empty.")

    df = df.copy()
    id_column = f"{entity_type}_id"
    source_columns = source_columns or []

    if id_column not in df.columns:
        df[id_column] = None

    missing_cols = [c for c in source_columns if c not in df.columns]
    if missing_cols:
        logger.warning(
            "Source columns %s not found in DataFrame. "
            "Affected rows will receive random UUID4 identifiers.",
            missing_cols,
        )
        source_columns = [c for c in source_columns if c in df.columns]

    mask = pd.Series(True, index=df.index) if overwrite else df[id_column].isna()

    if not mask.any():
        logger.debug("All rows already have IDs. Nothing to assign.")
        return df

    def build_seed(row: pd.Series) -> str:
        """Return the '|'-joined seed for ``row``, or '' if any value is unusable."""
        parts = []
        for col in source_columns:
            value = row[col]
            # NaN/NaT/pd.NA are not ``None``; without this check they
            # would be stringified ("nan") and hashed, giving every
            # such row the same UUID3.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return ""
            text = str(value).strip()
            if not text:
                return ""
            parts.append(text)
        return "|".join(parts)

    target_rows = df.loc[mask]
    if source_columns:
        # Row-wise apply is kept deliberately: the per-row value coercion
        # determines ``str(value)`` and therefore the resulting UUID3.
        seeds = target_rows.apply(build_seed, axis=1).tolist()
    else:
        seeds = [""] * len(target_rows)

    # A valid seed is never empty, so '' doubles as the "use UUID4" marker.
    df.loc[mask, id_column] = [
        str(uuid.uuid3(ENTITY_UUID_NAMESPACE, seed)) if seed else str(uuid.uuid4())
        for seed in seeds
    ]

    # Counts come from the seeds actually used, so the log always matches
    # the IDs that were generated.
    deterministic = sum(1 for seed in seeds if seed)
    fallback = len(seeds) - deterministic

    self._log(
        "info",
        "Assigned IDs for %d rows: %d deterministic (UUID3), %d fallback (UUID4).",
        len(seeds),
        deterministic,
        fallback,
    )
    return df
deduplicate_by_proximity(df, distance_threshold_m=50, lat_col='latitude', lon_col='longitude', keep='first')

Remove near-duplicate rows within a spatial distance threshold.

Uses a greedy forward scan over a KDTree built in UTM projection: each row is kept and all neighbours within distance_threshold_m metres are marked as duplicates. This is significantly faster than an all-pairs approach for large datasets and produces deterministic results given a fixed row order.

Designed for cross-source deduplication at the Silver layer where the same physical site may appear in multiple source files with different identifiers. For ID-based deduplication at the Gold layer use EntityTable.merge(deduplicate_by_id=True) instead.

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame containing coordinate columns.

required
distance_threshold_m float

Maximum distance in metres below which two rows are considered duplicates. Defaults to 50.

50
lat_col str

Name of the latitude column. Defaults to 'latitude'.

'latitude'
lon_col str

Name of the longitude column. Defaults to 'longitude'.

'longitude'
keep str

Which occurrence to retain when duplicates are found. 'first' retains the earliest row (default); 'last' retains the latest row.

'first'

Returns:

Type Description
DataFrame

DataFrame with near-duplicate rows removed, index reset.

Raises:

Type Description
ValueError

If distance_threshold_m is negative.

ValueError

If keep is not 'first' or 'last'.

ValueError

If lat_col or lon_col are absent from df.

Source code in gigaspatial/processing/entity_processor.py
def deduplicate_by_proximity(
    self,
    df: pd.DataFrame,
    distance_threshold_m: float = 50,
    lat_col: str = "latitude",
    lon_col: str = "longitude",
    keep: str = "first",
) -> pd.DataFrame:
    """
    Drop rows lying within a distance threshold of an earlier kept row.

    Points are projected to an estimated UTM CRS and indexed in a
    KDTree.  Rows are then visited in order: every row still marked as
    kept survives, and all of its neighbours within
    ``distance_threshold_m`` are marked as duplicates.  With
    ``keep='last'`` the scan runs over the reversed frame and the
    original ordering is restored before returning.

    Args:
        df: Input DataFrame containing coordinate columns.
        distance_threshold_m: Radius passed to the KDTree neighbour
            query, in the units of the projected CRS (metres for UTM).
            Defaults to ``50``.
        lat_col: Name of the latitude column. Defaults to ``'latitude'``.
        lon_col: Name of the longitude column. Defaults to ``'longitude'``.
        keep: ``'first'`` (default) retains the earliest row of each
            cluster; ``'last'`` retains the latest.

    Returns:
        DataFrame with near-duplicate rows removed, index reset.

    Raises:
        ValueError: If ``distance_threshold_m`` is negative.
        ValueError: If ``keep`` is not ``'first'`` or ``'last'``.
        ValueError: If ``lat_col`` or ``lon_col`` are absent from ``df``.
    """
    if distance_threshold_m < 0:
        raise ValueError("distance_threshold_m must be non-negative.")
    if keep not in ("first", "last"):
        raise ValueError(f"keep must be 'first' or 'last', got '{keep}'.")

    absent = [name for name in (lat_col, lon_col) if name not in df.columns]
    if absent:
        raise ValueError(
            f"Coordinate column(s) not found in DataFrame: {absent}. "
            f"Available columns: {df.columns.tolist()}"
        )

    if df.empty:
        return df.reset_index(drop=True)

    # Scanning the reversed frame makes the "first seen wins" rule keep
    # the last occurrence instead.
    scan_reversed = keep == "last"
    ordered = df.iloc[::-1] if scan_reversed else df
    ordered = ordered.reset_index(drop=True)

    gdf = convert_to_geodataframe(ordered, lat_col=lat_col, lon_col=lon_col)

    from gigaspatial.processing.geo import estimate_utm_crs_with_fallback

    projected = gdf.to_crs(estimate_utm_crs_with_fallback(gdf))
    coords = projected.get_coordinates().to_numpy()

    tree = cKDTree(coords)
    is_kept = np.ones(len(coords), dtype=bool)
    for idx, point in enumerate(coords):
        if is_kept[idx]:
            # A surviving row eliminates every other point inside its radius.
            for neighbour in tree.query_ball_point(point, r=distance_threshold_m):
                if neighbour != idx:
                    is_kept[neighbour] = False

    n_removed = (~is_kept).sum()
    if n_removed:
        self._log(
            "info",
            "%d of %d rows removed as near-duplicates "
            "(distance_threshold_m=%.1f).",
            n_removed,
            len(df),
            distance_threshold_m,
        )

    survivors = ordered.loc[is_kept].reset_index(drop=True)
    if scan_reversed:
        # Undo the reversal so output follows the caller's row order.
        survivors = survivors.iloc[::-1].reset_index(drop=True)
    return survivors
process(df, **kwargs)

Run the full cleaning pipeline on a raw DataFrame.

Steps are executed in a fixed order designed to avoid order-dependent bugs (e.g. numeric coercion must follow string stripping; coordinate repair must precede numeric coercion).

Parameters:

Name Type Description Default
df DataFrame

Raw input data. May be a plain pd.DataFrame or a gpd.GeoDataFrame; geometry columns are preserved throughout.

required
**kwargs

country (str): Optional country identifier. When provided, rows outside the country boundary are removed via _filter_by_country_boundary, and admin region columns are then appended via _annotate_with_admin_regions. Any additional kwargs are forwarded to both methods.

{}

Returns:

Type Description
DataFrame

Cleaned pd.DataFrame ready for Pydantic entity validation.

Source code in gigaspatial/processing/entity_processor.py
def process(self, df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Clean a raw DataFrame by running every pipeline step in a fixed order.

    The order is significant: strings are stripped before nulls are
    coerced, coordinates are repaired before numeric coercion, and row
    cleanup happens only after all value-level normalisation.

    Args:
        df: Raw input data. May be a plain ``pd.DataFrame`` or a
            ``gpd.GeoDataFrame``. The input is copied, not mutated.
        **kwargs:
            country (str): Optional country identifier.  When truthy:
                - rows are filtered via ``_filter_by_country_boundary``
                  (passed as ``boundary=country``);
                - admin region columns are appended via
                  ``_annotate_with_admin_regions``.
            All remaining kwargs are forwarded to both of those methods.

    Returns:
        Cleaned ``pd.DataFrame`` with ``NaN`` replaced by ``None``, ready
        for Pydantic entity validation.
    """
    # Each run starts with a fresh log.
    self.processing_logs = []

    # Structural fixes first: rename, then normalise column labels.
    cleaned = self._rename_columns(df.copy())
    cleaned.columns = [label.strip().lower() for label in cleaned.columns]

    # Value-level steps, applied strictly in this order.
    ordered_steps = (
        self._strip_strings,  # string normalisation, before numeric coercion
        self._coerce_nulls,  # null coercion on the stripped values
        self._repair_coordinate_columns,  # coordinate repair
        self._coerce_numeric_columns,  # numeric coercion
        self._normalize_casing,  # casing normalisation
        self._parse_geometry,  # geometry parsing
        self._drop_empty_rows,  # row cleanup
        self._drop_duplicates,
    )
    for step in ordered_steps:
        cleaned = step(cleaned)

    # Optional enrichment, only when a country was supplied.
    country = kwargs.pop("country", None)
    if country:
        cleaned = self._filter_by_country_boundary(
            cleaned, boundary=country, **kwargs
        )
        cleaned = self._annotate_with_admin_regions(cleaned, country, **kwargs)

    # NaN -> None so optional numeric fields with missing values can pass
    # Pydantic validation.
    return cleaned.replace({np.nan: None})
track_changes(func)

Decorator for EntityProcessor methods to automatically log shape changes.

Calculates the difference in rows and columns before and after the decorated method runs, appending a summary to self.processing_logs.

Source code in gigaspatial/processing/entity_processor.py
def track_changes(func):
    """
    Wrap an EntityProcessor step so that shape changes are recorded.

    The wrapper compares the row count and column names of the DataFrame
    passed in with those of the DataFrame returned, and appends a one-line
    summary to ``self.processing_logs`` when anything differs.  If the
    instance has no ``processing_logs`` attribute the step runs untracked.
    """

    @wraps(func)
    def wrapper(self, df: pd.DataFrame, *args, **kwargs):
        # No log sink on this instance: just delegate.
        if not hasattr(self, "processing_logs"):
            return func(self, df, *args, **kwargs)

        rows_before, _ = df.shape
        columns_before = set(df.columns)

        outcome = func(self, df, *args, **kwargs)

        rows_after, _ = outcome.shape
        columns_after = set(outcome.columns)

        notes = []

        # Positive delta means rows were lost, negative means rows were gained.
        row_delta = rows_before - rows_after
        if row_delta > 0:
            notes.append(f"dropped {row_delta} rows")
        elif row_delta < 0:
            notes.append(f"added {abs(row_delta)} rows")

        gained = columns_after - columns_before
        if gained:
            notes.append(f"added columns {sorted(gained)}")

        lost = columns_before - columns_after
        if lost:
            notes.append(f"dropped columns {sorted(lost)}")

        if notes:
            # Step label is the method name without leading/trailing underscores.
            label = func.__name__.strip("_")
            self.processing_logs.append(f"STEP [{label}]: " + ", ".join(notes))

        return outcome

    return wrapper
validate_coordinates(df, lat_col='latitude', lon_col='longitude', drop_invalid=True)

Validate coordinate columns, flagging or dropping invalid rows.

Three checks are performed independently so the log output distinguishes between different classes of data quality issue:

  • Missing: None or NaN in either coordinate column.
  • Out-of-range: latitude outside [-90, 90] or longitude outside [-180, 180].
  • Null island: both coordinates exactly 0.0, which is unambiguously invalid for real-world infrastructure entities.

pd.to_numeric is applied before comparison so the method is safe to call on columns that have not yet been through _coerce_numeric_columns (e.g. during manual Silver inspection).

Parameters:

Name Type Description Default
df DataFrame

Input DataFrame containing coordinate columns.

required
lat_col str

Name of the latitude column. Defaults to 'latitude'.

'latitude'
lon_col str

Name of the longitude column. Defaults to 'longitude'.

'longitude'
drop_invalid bool

If True (default), invalid rows are removed and a summary is logged. If False, an is_valid_coordinate boolean column is appended instead, allowing the caller to inspect and handle invalid rows manually before dropping.

True

Returns:

Type Description
DataFrame
  • drop_invalid=True: DataFrame with invalid rows removed, index reset.
DataFrame
  • drop_invalid=False: Original DataFrame with an appended is_valid_coordinate boolean column.

Raises:

Type Description
ValueError

If lat_col or lon_col are not present in df.

# inspect before dropping
df = processor.validate_coordinates(df, drop_invalid=False)
print(df[~df["is_valid_coordinate"]][["latitude", "longitude"]])

# drop once satisfied
df = df[df["is_valid_coordinate"]].drop(columns="is_valid_coordinate")
Source code in gigaspatial/processing/entity_processor.py
def validate_coordinates(
    self,
    df: pd.DataFrame,
    lat_col: str = "latitude",
    lon_col: str = "longitude",
    drop_invalid: bool = True,
) -> pd.DataFrame:
    """
    Check latitude/longitude columns and drop or flag the bad rows.

    A row is invalid when at least one of these independent checks hits
    (each check emits its own warning with a row count):

    - **Missing**: either coordinate is ``None``/``NaN`` or cannot be
      parsed as a number.
    - **Out-of-range**: latitude outside ``[-90, 90]`` or longitude
      outside ``[-180, 180]``.
    - **Null island**: latitude and longitude are both exactly ``0.0``.

    Values are passed through ``pd.to_numeric(errors="coerce")`` first,
    so string-typed coordinate columns are accepted.

    Args:
        df: Input DataFrame containing coordinate columns.
        lat_col: Name of the latitude column. Defaults to ``'latitude'``.
        lon_col: Name of the longitude column. Defaults to ``'longitude'``.
        drop_invalid: If ``True`` (default), invalid rows are removed.
            If ``False``, nothing is removed and a boolean
            ``is_valid_coordinate`` column is appended to a copy.

    Returns:
        - ``drop_invalid=True``: the valid rows only, index reset.
        - ``drop_invalid=False``: a copy of ``df`` with an extra
          ``is_valid_coordinate`` boolean column.

    Raises:
        ValueError: If ``lat_col`` or ``lon_col`` are not present in ``df``.

    Example::

        # inspect before dropping
        df = processor.validate_coordinates(df, drop_invalid=False)
        print(df[~df["is_valid_coordinate"]][["latitude", "longitude"]])

        # drop once satisfied
        df = df[df["is_valid_coordinate"]].drop(columns="is_valid_coordinate")
    """
    absent = [name for name in (lat_col, lon_col) if name not in df.columns]
    if absent:
        raise ValueError(
            f"Coordinate column(s) not found in DataFrame: {absent}. "
            f"Available columns: {df.columns.tolist()}"
        )

    latitudes = pd.to_numeric(df[lat_col], errors="coerce")
    longitudes = pd.to_numeric(df[lon_col], errors="coerce")

    is_missing = latitudes.isna() | longitudes.isna()
    is_out_of_range = (
        (latitudes < -90)
        | (latitudes > 90)
        | (longitudes < -180)
        | (longitudes > 180)
    )
    is_null_island = (latitudes == 0.0) & (longitudes == 0.0)
    is_invalid = is_missing | is_out_of_range | is_null_island

    # One warning per failure class, emitted only when that class occurs.
    per_class_warnings = (
        (is_missing, "%d rows have missing coordinate values."),
        (
            is_out_of_range,
            "%d rows have out-of-range coordinate values "
            "(lat outside [-90, 90] or lon outside [-180, 180]).",
        ),
        (is_null_island, "%d rows have null island coordinates (0.0, 0.0)."),
    )
    for flagged, template in per_class_warnings:
        if flagged.any():
            logger.warning(template, flagged.sum())

    invalid_count = is_invalid.sum()

    if not drop_invalid:
        # Flag-only mode: always report, never remove.
        flagged_df = df.copy()
        flagged_df["is_valid_coordinate"] = ~is_invalid
        self._log(
            "info",
            "%d of %d rows flagged as invalid coordinates.",
            invalid_count,
            len(flagged_df),
        )
        return flagged_df

    if invalid_count:
        self._log(
            "info",
            "%d of %d rows removed due to invalid coordinates.",
            invalid_count,
            len(df),
        )
    return df.loc[~is_invalid].reset_index(drop=True)

geo

Geospatial processing utilities. Provides robust helpers for Coordinate Reference System (CRS) management, coordinate detection, spatial joins, buffering, and geometry simplification.

add_area_in_meters(gdf, area_column_name='area_in_meters')

Calculate the area of geometries in square meters and add it as a new column.

Automatically handles UTM transformation for accurate area calculation.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing (Multi)Polygon geometries.

required
area_column_name str

Name of the new column.

'area_in_meters'

Returns:

Type Description
GeoDataFrame

A copy of the input GeoDataFrame with an additional area column; the input itself is not modified.

Raises:

Type Description
ValueError

If the input GeoDataFrame contains non-polygon geometries.

Source code in gigaspatial/processing/geo.py
def add_area_in_meters(
    gdf: gpd.GeoDataFrame, area_column_name: str = "area_in_meters"
) -> gpd.GeoDataFrame:
    """
    Return a copy of ``gdf`` with the area of every geometry, in square meters.

    The geometries are reprojected to the CRS returned by
    ``estimate_utm_crs()`` before the area is measured. If that estimation
    raises, a warning is logged and Web Mercator (EPSG:3857) is used instead.

    Args:
        gdf: GeoDataFrame containing (Multi)Polygon geometries.
        area_column_name: Name of the column that receives the area values.

    Returns:
        A copy of the input GeoDataFrame with the additional area column.
        The input itself is not modified.

    Raises:
        ValueError: If any geometry is not a Polygon or MultiPolygon.
    """
    # Reject anything that is not polygonal up front
    is_polygonal = gdf.geometry.geom_type.isin(["Polygon", "MultiPolygon"])
    if not is_polygonal.all():
        raise ValueError(
            "Input GeoDataFrame must contain only Polygon or MultiPolygon geometries."
        )

    # Work on a copy so the caller's frame stays untouched
    result = gdf.copy()

    # Pick a metric CRS; fall back to Web Mercator when UTM estimation fails
    try:
        metric_crs = result.estimate_utm_crs()
    except Exception as err:
        metric_crs = "EPSG:3857"
        LOGGER.warning(
            f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {err}"
        )

    # Measure the area on the reprojected geometries, store it on the copy
    reprojected = result.to_crs(metric_crs)
    result[area_column_name] = reprojected.geometry.area

    return result

add_spatial_jitter(df, columns=['latitude', 'longitude'], amount=0.0001, seed=None, copy=True)

Add random jitter to duplicated coordinates to separate overlapping points.

Parameters:

Name Type Description Default
df DataFrame

DataFrame containing coordinates.

required
columns List[str]

Column names to jitter.

['latitude', 'longitude']
amount float

Amount of jitter to add.

0.0001
seed

Random seed for reproducibility.

None
copy

Whether to create a copy of the input DataFrame.

True

Returns:

Type Description
DataFrame

DataFrame with jittered coordinates.

Raises:

Type Description
ValueError

If columns don't exist or amount is invalid.

TypeError

If input types are incorrect.

Source code in gigaspatial/processing/geo.py
def add_spatial_jitter(
    df: pd.DataFrame,
    columns: List[str] = ["latitude", "longitude"],
    amount: Union[float, Dict[str, float]] = 0.0001,
    seed=None,
    copy=True,
) -> pd.DataFrame:
    """
    Add random jitter to duplicated coordinates to separate overlapping points.

    Only rows whose values in ``columns`` are duplicated receive jitter; every
    other row is left untouched. Rows with a missing value in any of
    ``columns`` are skipped, because adding jitter to NaN cannot separate them.
    If duplicates remain after a round, the jitter amount is doubled and the
    remaining duplicates are jittered again, for a bounded number of rounds.

    Args:
        df: DataFrame containing coordinates.
        columns: Column names to jitter.
        amount: Maximum absolute jitter. Either a single positive number used
            for every column, or a dict mapping each column to its own
            positive amount.
        seed: Random seed for reproducibility. When given, a private
            ``RandomState`` is used so the global NumPy random state is left
            untouched; the values drawn are the same as those produced by
            ``np.random.seed(seed)`` followed by ``np.random.uniform``.
        copy: Whether to create a copy of the input DataFrame. With
            ``copy=False`` the input is modified in place.

    Returns:
        DataFrame with jittered coordinates.

    Raises:
        ValueError: If columns don't exist or amount is invalid.
        TypeError: If input types are incorrect.
        RuntimeError: If the jittering operation fails or duplicates cannot
            be separated within the round limit.
    """

    # Input validation
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    # Private list: never alias the shared default argument or the caller's list
    columns = list(columns)

    if not all(col in df.columns for col in columns):
        raise ValueError(f"Not all columns {columns} found in DataFrame")

    # Handle jitter amounts
    if isinstance(amount, (int, float)):
        # `not amount > 0` (rather than `amount <= 0`) also rejects NaN
        if not amount > 0:
            raise ValueError("Jitter amount must be positive")
        jitter_amounts = {col: amount for col in columns}
    elif isinstance(amount, dict):
        if not all(col in amount for col in columns):
            raise ValueError("Must specify jitter amount for each column")
        if not all(amt > 0 for amt in amount.values()):
            raise ValueError("All jitter amounts must be positive")
        jitter_amounts = dict(amount)
    else:
        raise TypeError("amount must be a number or dictionary")

    # Create copy if requested
    df_work = df.copy() if copy else df

    # Upper bound on doubling rounds so the function always terminates
    max_rounds = 32

    try:
        # Rows with missing coordinates can never be separated by jitter
        # (NaN + x is NaN), so they are excluded from the duplicate search.
        jitterable = df_work[columns].notna().all(axis=1)

        for _ in range(max_rounds):
            # Find duplicated coordinates among the rows that can be jittered
            duplicate_mask = (
                df_work.duplicated(subset=columns, keep=False) & jitterable
            )
            n_duplicates = int(duplicate_mask.sum())

            if n_duplicates == 0:
                return df_work

            # A private legacy generator yields the same stream as
            # np.random.seed(seed) without touching the global state. It is
            # re-created every round, matching the historical re-seeding.
            rng = np.random.RandomState(seed) if seed is not None else np.random

            # Add jitter to each column separately
            for col in columns:
                jitter = rng.uniform(
                    low=-jitter_amounts[col],
                    high=jitter_amounts[col],
                    size=n_duplicates,
                )
                df_work.loc[duplicate_mask, col] += jitter

            # Duplicates may survive (e.g. limited float precision): retry
            # with twice the amount on the next round.
            jitter_amounts = {col: amt * 2 for col, amt in jitter_amounts.items()}

        # Final check after the last permitted round
        if not (df_work.duplicated(subset=columns, keep=False) & jitterable).any():
            return df_work

    except Exception as e:
        raise RuntimeError(f"Error during jittering operation: {str(e)}") from e

    raise RuntimeError(
        "Error during jittering operation: duplicated coordinates could not be "
        f"separated after {max_rounds} rounds"
    )

aggregate_points_to_zones(points, zones, value_columns=None, aggregation='count', point_zone_predicate='within', zone_id_column='zone_id', output_suffix='', drop_geometry=False)

Aggregate point data to zones with flexible aggregation methods.

For zones with no overlapping points, "count" aggregation fills missing values with 0, while all other aggregations fill missing values with np.nan.

Parameters:

Name Type Description Default
points Union[DataFrame, GeoDataFrame]

Point data to aggregate.

required
zones GeoDataFrame

Zones to aggregate points to.

required
value_columns Optional[Union[str, List[str]]]

Column(s) containing values to aggregate.

None
aggregation Union[str, Dict[str, str]]

Aggregation method(s) to use.

'count'
point_zone_predicate str

Spatial predicate (e.g., 'within', 'intersects').

'within'
zone_id_column str

Column in zones containing zone identifiers.

'zone_id'
output_suffix str

Suffix to add to output column names.

''
drop_geometry bool

Whether to drop the geometry column from output.

False

Returns:

Type Description
GeoDataFrame

Zones with aggregated point values.

Raises:

Type Description
TypeError

If zones is not a GeoDataFrame or aggregation is invalid.

ValueError

If columns are missing or metadata is inconsistent.

Source code in gigaspatial/processing/geo.py
def aggregate_points_to_zones(
    points: Union[pd.DataFrame, gpd.GeoDataFrame],
    zones: gpd.GeoDataFrame,
    value_columns: Optional[Union[str, List[str]]] = None,
    aggregation: Union[str, Dict[str, str]] = "count",
    point_zone_predicate: str = "within",
    zone_id_column: str = "zone_id",
    output_suffix: str = "",
    drop_geometry: bool = False,
) -> gpd.GeoDataFrame:
    """
    Aggregate point data to zones with flexible aggregation methods.

    Points are spatially joined to the zones and grouped by ``zone_id_column``.
    With ``aggregation="count"`` the result gets a single
    ``point_count{output_suffix}`` column. Otherwise each value column is
    aggregated into ``{column}{output_suffix}``.

    For zones with no overlapping points:
    - ``"count"`` aggregation fills missing values with ``0``.
    - All other aggregations leave missing values as ``np.nan``, so an empty
      zone stays distinguishable from a zone whose true aggregate is zero.

    Only the zone identifier and geometry take part in the spatial join, so
    other zone attributes may share names with columns of ``points``.

    Args:
        points: Point data to aggregate. A plain DataFrame is converted with
            ``convert_to_geodataframe``.
        zones: Zones to aggregate points to.
        value_columns: Column(s) containing values to aggregate. Not used for
            the plain ``"count"`` aggregation.
        aggregation: Aggregation method(s) to use: one method name applied to
            every value column, or a dict mapping each value column to a method.
        point_zone_predicate: Spatial predicate (e.g., 'within', 'intersects').
        zone_id_column: Column in zones containing zone identifiers.
        output_suffix: Suffix to add to output column names.
        drop_geometry: Whether to drop the geometry column from output.

    Returns:
        Zones with aggregated point values.

    Raises:
        TypeError: If zones is not a GeoDataFrame or aggregation is invalid.
        ValueError: If columns are missing, the points data already contains
            a column named ``zone_id_column``, or the aggregation
            specification is inconsistent with ``value_columns``.
    """
    # --- Input validation ---
    if not isinstance(zones, gpd.GeoDataFrame):
        raise TypeError("zones must be a GeoDataFrame")

    if zone_id_column not in zones.columns:
        raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

    # --- Normalise points ---
    points_gdf = (
        convert_to_geodataframe(points)
        if not isinstance(points, gpd.GeoDataFrame)
        else points.copy()
    )

    # The spatial join suffixes column names shared by both sides, which would
    # hide the zone identifier from the group-by below. Fail early with a
    # clear message, as aggregate_polygons_to_zones does.
    if zone_id_column in points_gdf.columns:
        raise ValueError(
            f"Column name conflict: points DataFrame contains column '{zone_id_column}' "
            "which conflicts with the zone identifier column. "
            "Please rename this column in the points data."
        )

    # --- CRS alignment ---
    if points_gdf.crs != zones.crs:
        points_gdf = points_gdf.to_crs(zones.crs)

    # --- Normalise value_columns ---
    if isinstance(value_columns, str):
        value_columns = [value_columns]

    if value_columns is not None:
        missing_cols = [col for col in value_columns if col not in points_gdf.columns]
        if missing_cols:
            raise ValueError(f"Value columns not found in points data: {missing_cols}")

    # --- Build agg_funcs and per-output-column method lookup ---
    agg_funcs: Dict[str, str] = {}
    # Maps output column name → aggregation method for fill-value decisions
    agg_method_for_output_col: Dict[str, str] = {}

    if isinstance(aggregation, str):
        if aggregation == "count":
            # Sentinel key: plain counting uses group sizes, not a value column
            agg_funcs["__count"] = "count"
        elif value_columns is not None:
            agg_funcs = {col: aggregation for col in value_columns}
            agg_method_for_output_col = {
                f"{col}{output_suffix}": aggregation for col in value_columns
            }
        else:
            raise ValueError(
                "value_columns must be specified for aggregation methods other than 'count'"
            )
    elif isinstance(aggregation, dict):
        if value_columns is None:
            raise ValueError(
                "value_columns must be specified when using a dict of aggregation methods"
            )
        missing_aggs = [col for col in value_columns if col not in aggregation]
        extra_aggs = [col for col in aggregation if col not in value_columns]
        if missing_aggs:
            raise ValueError(f"Missing aggregation methods for columns: {missing_aggs}")
        if extra_aggs:
            raise ValueError(
                f"Aggregation methods specified for columns not in value_columns: {extra_aggs}"
            )
        agg_funcs = dict(aggregation)
        agg_method_for_output_col = {
            f"{col}{output_suffix}": method for col, method in aggregation.items()
        }
    else:
        raise TypeError("aggregation must be a str or dict")

    # --- Spatial join ---
    zone_geometry = zones.geometry.name
    result = zones.copy()
    # Join against identifier + geometry only: carrying every zone attribute
    # into the join would rename any value column that shares a name with a
    # zone attribute and break the aggregation below.
    joined = gpd.sjoin(
        points_gdf,
        zones[[zone_id_column, zone_geometry]],
        how="inner",
        predicate=point_zone_predicate,
    )

    # --- Aggregation ---
    if "__count" in agg_funcs:
        count_column = f"point_count{output_suffix}"
        counts = joined.groupby(zone_id_column).size().reset_index(name=count_column)
        result = result.merge(counts, on=zone_id_column, how="left")
        # Zones without any point get an explicit zero
        result[count_column] = result[count_column].fillna(0).astype(int)
    else:
        # Drop geometry before non-count aggregations to avoid errors
        points_geometry = points_gdf.geometry.name
        if points_geometry in joined.columns:
            joined = joined.drop(columns=[points_geometry])

        aggregated = joined.groupby(zone_id_column).agg(agg_funcs).reset_index()

        # Flatten MultiIndex columns produced by some pandas agg paths
        if isinstance(aggregated.columns, pd.MultiIndex):
            aggregated.columns = [
                (
                    f"{col[0]}_{col[1]}{output_suffix}"
                    if col[0] != zone_id_column
                    else zone_id_column
                )
                for col in aggregated.columns
            ]
        else:
            # Single-level: rename value columns to include suffix
            aggregated = aggregated.rename(
                columns={
                    col: f"{col}{output_suffix}"
                    for col in aggregated.columns
                    if col != zone_id_column
                }
            )

        result = result.merge(aggregated, on=zone_id_column, how="left")

        # Only 'count' outputs are zero-filled. Every other aggregate keeps
        # NaN for zones without points.
        for out_col, method in agg_method_for_output_col.items():
            if method != "count" or out_col in (zone_id_column, zone_geometry):
                continue
            if out_col not in result.columns:
                continue
            if not pd.api.types.is_numeric_dtype(result[out_col]):
                continue
            result[out_col] = result[out_col].fillna(0)

    if drop_geometry:
        result = result.drop(columns=[zone_geometry])

    return result

aggregate_polygons_to_zones(polygons, zones, value_columns, aggregation='sum', predicate='intersects', zone_id_column='zone_id', output_suffix='', drop_geometry=False)

Aggregates polygon data to zones based on a spatial relationship.

Parameters:

Name Type Description Default
polygons Union[DataFrame, GeoDataFrame]

Polygon data to aggregate.

required
zones GeoDataFrame

Target zones.

required
value_columns Union[str, List[str]]

Column(s) in polygons with numeric values.

required
aggregation Union[str, Dict[str, str]]

Aggregation method(s) to use (str or dict).

'sum'
predicate Literal['intersects', 'within', 'fractional']

Spatial relationship ('intersects', 'within', 'fractional').

'intersects'
zone_id_column str

Unique identifier column in zones.

'zone_id'
output_suffix str

Suffix for output columns.

''
drop_geometry bool

Whether to drop the geometry column.

False

Returns:

Type Description
GeoDataFrame

The zones GeoDataFrame with aggregated values.

Raises:

Type Description
TypeError

If zones is not a GeoDataFrame, or polygons is neither a GeoDataFrame nor convertible to one.

ValueError

If columns or predicates are invalid.

Source code in gigaspatial/processing/geo.py
def aggregate_polygons_to_zones(
    polygons: Union[pd.DataFrame, gpd.GeoDataFrame],
    zones: gpd.GeoDataFrame,
    value_columns: Union[str, List[str]],
    aggregation: Union[str, Dict[str, str]] = "sum",
    predicate: Literal["intersects", "within", "fractional"] = "intersects",
    zone_id_column: str = "zone_id",
    output_suffix: str = "",
    drop_geometry: bool = False,
) -> gpd.GeoDataFrame:
    """
    Aggregate polygon attributes onto zones using a spatial relationship.

    The ``'fractional'`` predicate is delegated to ``_fractional_aggregation``;
    ``'intersects'`` and ``'within'`` are delegated to ``_simple_aggregation``.
    The aggregated columns are merged back onto the full ``zones`` frame.
    Missing numeric aggregates are filled with ``0`` for the ``"count"``
    method and left as ``np.nan`` for every other method.

    Args:
        polygons: Polygon data to aggregate. A plain DataFrame is converted
            with ``convert_to_geodataframe``.
        zones: Target zones.
        value_columns: Column(s) in polygons with numeric values.
        aggregation: Aggregation method(s) to use (str or dict).
        predicate: Spatial relationship ('intersects', 'within', 'fractional').
        zone_id_column: Unique identifier column in zones.
        output_suffix: Suffix appended to the aggregated column names.
        drop_geometry: Whether to drop the geometry column.

    Returns:
        The zones GeoDataFrame with aggregated values. When ``polygons`` is
        empty, ``zones`` is returned unchanged after logging a warning.

    Raises:
        TypeError: If zones is not a GeoDataFrame, or polygons cannot be
            converted to one.
        ValueError: If zones is empty, a required column is missing, the
            predicate is unsupported, polygons contain non-polygon geometries,
            or polygons already contain a column named ``zone_id_column``.
    """
    # --- Validate zones and predicate ---
    if not isinstance(zones, gpd.GeoDataFrame):
        raise TypeError("zones must be a GeoDataFrame")
    if zones.empty:
        raise ValueError("zones GeoDataFrame is empty")
    if zone_id_column not in zones.columns:
        raise ValueError(f"Zone ID column '{zone_id_column}' not found in zones")

    supported_predicates = ("intersects", "within", "fractional")
    if predicate not in supported_predicates:
        raise ValueError(
            f"Unsupported predicate: '{predicate}'. "
            "Must be one of: 'intersects', 'within', 'fractional'."
        )

    # --- Bring polygons into GeoDataFrame form ---
    if isinstance(polygons, gpd.GeoDataFrame):
        polygons_gdf = polygons.copy()
    else:
        try:
            polygons_gdf = convert_to_geodataframe(polygons)
        except Exception as e:
            raise TypeError(
                f"polygons must be a GeoDataFrame or convertible to one: {e}"
            )

    # Nothing to aggregate: hand the zones back untouched
    if polygons_gdf.empty:
        LOGGER.warning("Empty polygons GeoDataFrame provided")
        return zones

    # --- Only (Multi)Polygon geometries are accepted ---
    polygonal_types = ("Polygon", "MultiPolygon")
    non_polygon_geoms = []
    for geom_type in polygons_gdf.geometry.geom_type.unique():
        if geom_type not in polygonal_types:
            non_polygon_geoms.append(geom_type)
    if non_polygon_geoms:
        raise ValueError(
            f"Input contains non-polygon geometries: {non_polygon_geoms}. "
            "Use aggregate_points_to_zones for point data."
        )

    # --- Normalise and check value columns ---
    if isinstance(value_columns, str):
        value_columns = [value_columns]

    missing_cols = [name for name in value_columns if name not in polygons_gdf.columns]
    if missing_cols:
        raise ValueError(f"Value columns not found in polygons data: {missing_cols}")

    if zone_id_column in polygons_gdf.columns:
        raise ValueError(
            f"Column name conflict: polygons DataFrame contains column '{zone_id_column}' "
            "which conflicts with the zone identifier column. "
            "Please rename this column in the polygons data."
        )

    # --- Put both layers in the zones' CRS ---
    if polygons_gdf.crs != zones.crs:
        polygons_gdf = polygons_gdf.to_crs(zones.crs)

    # --- Resolve the aggregation specification ---
    agg_funcs = _process_aggregation_methods(aggregation, value_columns)

    # Method per un-suffixed column name, consulted when choosing fill values
    agg_method_for_col: Dict[str, str]
    if isinstance(aggregation, str):
        agg_method_for_col = {name: aggregation for name in value_columns}
    else:
        agg_method_for_col = dict(aggregation)

    # --- Aggregate against identifier + geometry only ---
    minimal_zones = zones[[zone_id_column, "geometry"]].copy()

    if predicate == "fractional":
        aggregated_data = _fractional_aggregation(
            polygons_gdf, minimal_zones, value_columns, agg_funcs, zone_id_column
        )
    else:
        aggregated_data = _simple_aggregation(
            polygons_gdf,
            minimal_zones,
            value_columns,
            agg_funcs,
            zone_id_column,
            predicate,
        )

    # --- Attach the aggregates to the complete zones frame ---
    attribute_columns = [
        name for name in aggregated_data.columns if name != "geometry"
    ]
    result = zones.merge(
        aggregated_data[attribute_columns], on=zone_id_column, how="left"
    )

    # --- Fill gaps: 0 for count, NaN otherwise ---
    # The suffix is applied afterwards, so the names still match the keys of
    # agg_method_for_col at this point.
    aggregated_cols = [name for name in result.columns if name not in zones.columns]
    for name in aggregated_cols:
        if pd.api.types.is_numeric_dtype(result[name]):
            is_count = agg_method_for_col.get(name, "") == "count"
            result[name] = result[name].fillna(0 if is_count else np.nan)

    # --- Suffix the new columns ---
    if output_suffix:
        result = result.rename(
            columns={name: f"{name}{output_suffix}" for name in aggregated_cols}
        )

    if drop_geometry:
        result = result.drop(columns=["geometry"])

    return result

annotate_with_admin_regions(gdf, country_code, data_store=None, admin_id_column_suffix='')

Annotate a GeoDataFrame with administrative region information.

Performs a spatial join between the input points and administrative boundaries at levels 1 and 2, resolving conflicts when points intersect multiple regions.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing points to annotate.

required
country_code str

ISO country code for administrative boundaries.

required
data_store Optional[DataStore]

Optional DataStore for loading boundary data.

None
admin_id_column_suffix

Optional suffix for admin ID columns.

''

Returns:

Type Description
GeoDataFrame

GeoDataFrame with added administrative region columns (admin1, admin2, etc.).

Raises:

Type Description
TypeError

If gdf is not a GeoDataFrame.

Source code in gigaspatial/processing/geo.py
def annotate_with_admin_regions(
    gdf: gpd.GeoDataFrame,
    country_code: str,
    data_store: Optional[DataStore] = None,
    admin_id_column_suffix="",
) -> gpd.GeoDataFrame:
    """
    Annotate a GeoDataFrame with administrative region information.

    Level 1 and level 2 boundaries are loaded through ``AdminBoundaries`` and
    merged into one lookup table, which is spatially joined (``intersects``)
    to the input geometries. When a geometry intersects several regions, the
    region whose centroid is closest to the geometry is kept, so the output
    has one row per input row.

    The input is never modified: a missing CRS is assumed to be EPSG:4326 on
    a copy, and other CRSs are reprojected to EPSG:4326.

    Args:
        gdf: GeoDataFrame containing points to annotate.
        country_code: ISO country code for administrative boundaries.
        data_store: Optional DataStore for loading boundary data.
        admin_id_column_suffix: Optional suffix for admin ID columns.

    Returns:
        GeoDataFrame in EPSG:4326 with a fresh integer index and the added
        columns ``admin1``, ``admin2``, ``admin1_id{suffix}`` and
        ``admin2_id{suffix}``. An empty input is returned as-is.

    Raises:
        TypeError: If gdf is not a GeoDataFrame.
    """
    from gigaspatial.handlers.boundaries import AdminBoundaries

    if not isinstance(gdf, gpd.GeoDataFrame):
        raise TypeError("gdf must be a GeoDataFrame")

    if gdf.empty:
        LOGGER.warning("Empty GeoDataFrame provided, returning as-is")
        return gdf

    admin1_id_col = f"admin1_id{admin_id_column_suffix}"
    admin2_id_col = f"admin2_id{admin_id_column_suffix}"

    # read country admin data
    admin1_data = AdminBoundaries.create(
        country_code=country_code, admin_level=1, data_store=data_store
    ).to_geodataframe()
    admin1_data = admin1_data.rename(
        columns={"boundary_id": admin1_id_col, "name": "admin1"}
    )[[admin1_id_col, "admin1", "geometry"]]

    admin2_data = AdminBoundaries.create(
        country_code=country_code, admin_level=2, data_store=data_store
    ).to_geodataframe()
    admin2_data = admin2_data.rename(
        columns={
            "boundary_id": admin2_id_col,
            "parent_id": admin1_id_col,
            "name": "admin2",
        }
    )[[admin2_id_col, "admin2", admin1_id_col, "geometry"]]

    # Join level 2 to its parent level 1 on the admin1 id. The outer join
    # keeps level-1 regions that have no level-2 children.
    admin_data = admin2_data.merge(
        admin1_data,
        left_on=admin1_id_col,
        right_on=admin1_id_col,
        how="outer",
    )

    # Prefer the level-2 geometry, fall back to the level-1 geometry
    admin_data["geometry"] = admin_data.apply(
        lambda x: x.geometry_x if x.geometry_x else x.geometry_y, axis=1
    )

    admin_data = gpd.GeoDataFrame(
        admin_data.drop(columns=["geometry_x", "geometry_y"]),
        geometry="geometry",
        crs=4326,
    )

    admin_data[admin2_id_col] = admin_data[admin2_id_col].replace({np.nan: None})

    if gdf.crs is None:
        LOGGER.warning("Input GeoDataFrame has no CRS, assuming EPSG:4326")
        # set_crs without inplace returns a new frame; the caller's stays untouched
        gdf = gdf.set_crs(epsg=4326)
    elif gdf.crs != "EPSG:4326":
        LOGGER.info(f"Reprojecting from {gdf.crs} to EPSG:4326")
        gdf = gdf.to_crs(epsg=4326)

    # A fresh positional index gives every input row a unique label, which is
    # what identifies its (possibly multiple) matches after the join. The
    # index is reset on return anyway.
    points = gdf.reset_index(drop=True)

    # spatial join gdf to admins
    gdf_w_admins = points.sjoin(
        admin_data,
        how="left",
        predicate="intersects",
    )

    # Check for duplicates caused by points intersecting multiple polygons
    if len(gdf_w_admins) != len(points):
        LOGGER.warning(
            "Some points intersect multiple administrative boundaries. Resolving conflicts..."
        )

        # Distance from each joined geometry to the centroid of its matched
        # region. Unmatched rows (no index_right) get +inf; they are the only
        # row for their input, so they are always kept.
        admin_geometries = admin_data.geometry
        distances = np.array(
            [
                (
                    np.inf
                    if pd.isna(right_idx)
                    else geom.distance(admin_geometries.loc[int(right_idx)].centroid)
                )
                for geom, right_idx in zip(
                    gdf_w_admins.geometry, gdf_w_admins["index_right"]
                )
            ],
            dtype=float,
        )

        # Visit rows from closest to farthest and keep the first row seen for
        # each input label. Sorting the kept positions preserves join order.
        order = np.argsort(distances, kind="stable")
        labels_by_distance = gdf_w_admins.index.to_numpy()[order]
        first_seen = ~pd.Index(labels_by_distance).duplicated(keep="first")
        gdf_w_admins = gdf_w_admins.iloc[np.sort(order[first_seen])]

    # Drop unnecessary columns and reset the index
    gdf_w_admins = gdf_w_admins.drop(columns="index_right").reset_index(drop=True)

    return gdf_w_admins

buffer_geodataframe(gdf, buffer_distance_meters, cap_style='round', copy=True)

Buffer a GeoDataFrame with a distance in meters.

Automatically handles UTM transformation for accurate buffering.

Parameters:

Name Type Description Default
gdf GeoDataFrame

The GeoDataFrame to be buffered.

required
buffer_distance_meters Union[float, array, Series]

The buffer distance.

required
cap_style Literal['round', 'square', 'flat']

Style of caps ('round', 'square', 'flat').

'round'
copy

Whether to create a copy of the input.

True

Returns:

Type Description
GeoDataFrame

The buffered GeoDataFrame in the original CRS.

Source code in gigaspatial/processing/geo.py
def buffer_geodataframe(
    gdf: gpd.GeoDataFrame,
    buffer_distance_meters: Union[float, np.array, pd.Series],
    cap_style: Literal["round", "square", "flat"] = "round",
    copy=True,
) -> gpd.GeoDataFrame:
    """
    Buffer the geometries of a GeoDataFrame by a distance given in meters.

    The frame is reprojected to the CRS returned by ``estimate_utm_crs()``
    (Web Mercator, EPSG:3857, if that estimation raises), buffered there, and
    reprojected back to the CRS it came in with.

    Args:
        gdf: The GeoDataFrame to be buffered. Must have a CRS.
        buffer_distance_meters: The buffer distance.
        cap_style: Style of caps ('round', 'square', 'flat').
        copy: Whether to start from a copy of the input.

    Returns:
        The buffered GeoDataFrame in the original CRS.

    Raises:
        TypeError: If gdf is not a GeoDataFrame.
        ValueError: If cap_style is not supported or gdf has no CRS.
        RuntimeError: If reprojection or buffering fails.
    """

    # Guard clauses
    if not isinstance(gdf, gpd.GeoDataFrame):
        raise TypeError("Input must be a GeoDataFrame")

    if cap_style not in ("round", "square", "flat"):
        raise ValueError("cap_style must be round, flat or square.")

    if gdf.crs is None:
        raise ValueError("Input GeoDataFrame must have a defined CRS")

    working = gdf.copy() if copy else gdf

    # Remember the CRS to return to once buffering is done
    source_crs = working.crs

    try:
        # Choose a metric CRS, with Web Mercator as the fallback
        try:
            metric_crs = working.estimate_utm_crs()
        except Exception as e:
            metric_crs = "EPSG:3857"
            LOGGER.warning(
                f"Warning: UTM CRS estimation failed, using Web Mercator. Error: {e}"
            )

        # Buffer in metric units, then go back to the source CRS
        projected = working.to_crs(metric_crs)
        buffered_geometry = projected["geometry"].buffer(
            distance=buffer_distance_meters, cap_style=cap_style
        )
        projected["geometry"] = buffered_geometry

        return projected.to_crs(source_crs)

    except Exception as e:
        raise RuntimeError(f"Error during buffering operation: {str(e)}")

calculate_distance(lat1, lon1, lat2, lon2, R=6371000.0)

Calculate the Haversine distance between two points.

Parameters:

Name Type Description Default
lat1

Latitude of point 1.

required
lon1

Longitude of point 1.

required
lat2

Latitude of point 2.

required
lon2

Longitude of point 2.

required
R

Earth radius in meters.

6371000.0

Returns:

Type Description

Distance in meters.

Source code in gigaspatial/processing/geo.py
def calculate_distance(lat1, lon1, lat2, lon2, R=6371e3):
    """
    Calculate the Haversine distance between two points.

    Inputs are in degrees and may be scalars or array-likes. They are
    converted independently, so shapes only need to be broadcastable (for
    example one fixed point against an array of points).

    Args:
        lat1: Latitude of point 1.
        lon1: Longitude of point 1.
        lat2: Latitude of point 2.
        lon2: Longitude of point 2.
        R: Earth radius in meters.

    Returns:
        Distance in meters: a NumPy scalar for scalar inputs, otherwise an
        array with the broadcast shape of the inputs.
    """
    # Convert each argument on its own: stacking them into one list first
    # would require all four to share the same shape.
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(value, dtype=float))
        for value in (lat1, lon1, lat2, lon2)
    )
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    distance = R * c
    return distance

convert_to_geodataframe(data, lat_col=None, lon_col=None, crs='EPSG:4326')

Convert a pandas DataFrame to a GeoDataFrame.

Supports conversion from latitude/longitude columns or WKT/geometry columns.

Parameters:

Name Type Description Default
data DataFrame

Input DataFrame.

required
lat_col str

Name of the latitude column.

None
lon_col str

Name of the longitude column.

None
crs

Coordinate Reference System. Defaults to 'EPSG:4326'.

'EPSG:4326'

Returns:

Type Description
GeoDataFrame

A GeoDataFrame containing the input data with a geometry column.

Raises:

Type Description
TypeError

If input is not a pandas DataFrame.

ValueError

If required columns are missing or invalid.

Source code in gigaspatial/processing/geo.py
def convert_to_geodataframe(
    data: pd.DataFrame, lat_col: str = None, lon_col: str = None, crs="EPSG:4326"
) -> gpd.GeoDataFrame:
    """
    Build a GeoDataFrame from a pandas DataFrame.

    When the frame has a ``geometry`` column, it must hold either geometry
    objects (used as they are) or WKT strings (parsed, with the text column
    dropped). Otherwise point geometries are built from latitude/longitude
    columns, which are auto-detected with ``detect_coordinate_columns`` when
    not supplied.

    Args:
        data: Input DataFrame.
        lat_col: Name of the latitude column.
        lon_col: Name of the longitude column.
        crs: Coordinate Reference System. Defaults to 'EPSG:4326'.

    Returns:
        A GeoDataFrame containing a copy of the input data with a geometry column.

    Raises:
        TypeError: If input is not a pandas DataFrame.
        RuntimeError: If the conversion fails for any reason. This includes
            missing or invalid coordinate/geometry columns: those checks raise
            ``ValueError`` internally, which the blanket handler at the end of
            the function re-raises as ``RuntimeError``.
    """
    # NOTE(review): callers expecting ValueError for bad columns will not see
    # it, because of the blanket handler below; confirm which contract is intended.

    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input 'data' must be a pandas DataFrame")

    # Never touch the caller's frame
    frame = data.copy()

    try:
        if "geometry" in frame.columns:
            geometry_column = frame["geometry"]
            holds_geometries = geometry_column.apply(
                lambda value: isinstance(value, base.BaseGeometry)
            ).all()

            if holds_geometries:
                # Already geometry objects: use them directly
                geometry = geometry_column
            elif geometry_column.apply(lambda value: isinstance(value, str)).all():
                # WKT text: parse it and discard the text column
                geometry = geometry_column.apply(wkt.loads)
                frame = frame.drop(columns=["geometry"])
            else:
                raise ValueError(
                    "Invalid geometry format: contains mixed or unsupported types"
                )
        else:
            # Fill in whichever coordinate column names were not supplied
            if lat_col is None or lon_col is None:
                try:
                    detected_lat, detected_lon = detect_coordinate_columns(frame)
                    lat_col = lat_col or detected_lat
                    lon_col = lon_col or detected_lon
                except ValueError as e:
                    raise ValueError(
                        f"Could not automatically detect coordinate columns and no "
                        f"'geometry' column found. Error: {str(e)}"
                    )

            # Both coordinate columns must be present ...
            if not (lat_col in frame.columns and lon_col in frame.columns):
                raise ValueError(
                    f"Could not find columns: {lat_col} and/or {lon_col} in the DataFrame"
                )

            # ... and fully populated
            has_missing = frame[lat_col].isna().any() or frame[lon_col].isna().any()
            if has_missing:
                raise ValueError(
                    f"Missing values found in {lat_col} and/or {lon_col} columns"
                )

            # Longitude is x, latitude is y
            geometry = gpd.points_from_xy(x=frame[lon_col], y=frame[lat_col])

        return gpd.GeoDataFrame(frame, geometry=geometry, crs=crs)

    except Exception as e:
        raise RuntimeError(f"Error converting to GeoDataFrame: {str(e)}")

detect_coordinate_columns(data, lat_keywords=None, lon_keywords=None, case_sensitive=False)

Detect latitude and longitude columns using keyword matching.

Parameters:

Name Type Description Default
data

DataFrame to search for coordinate columns.

required
lat_keywords

Keywords for identifying latitude columns.

None
lon_keywords

Keywords for identifying longitude columns.

None
case_sensitive

Whether to perform case-sensitive matching.

False

Returns:

Type Description
Tuple[str, str]

Tuple of (latitude_column_name, longitude_column_name).

Raises:

Type Description
ValueError

If no unique pair of latitude/longitude columns can be found.

TypeError

If input data is not a pandas DataFrame.

Source code in gigaspatial/processing/geo.py
def detect_coordinate_columns(
    data, lat_keywords=None, lon_keywords=None, case_sensitive=False
) -> Tuple[str, str]:
    """
    Detect latitude and longitude columns using keyword matching.

    Every keyword is escaped and matched between regex word boundaries
    against each column name. A column that matches both the latitude and the
    longitude keywords is ambiguous and is discarded from both candidate
    lists before the result is evaluated.

    Args:
        data: DataFrame to search for coordinate columns.
        lat_keywords: Keywords for identifying latitude columns. Falls back to
            a built-in list when None or empty.
        lon_keywords: Keywords for identifying longitude columns. Falls back to
            a built-in list when None or empty.
        case_sensitive: Whether to perform case-sensitive matching.

    Returns:
        Tuple of (latitude_column_name, longitude_column_name).

    Raises:
        ValueError: If the DataFrame has duplicate column names, or if no
            unique pair of latitude/longitude columns can be found.
        TypeError: If input data is not a pandas DataFrame.
        RuntimeError: If any other error occurs while matching column names.
    """

    # Input validation comes first so invalid input fails before any other work
    if not isinstance(data, pd.DataFrame):
        raise TypeError("Input must be a pandas DataFrame")

    if not data.columns.is_unique:
        raise ValueError("DataFrame contains duplicate column names")

    # Default keywords if none provided
    default_lat = [
        "latitude",
        "lat",
        "y",
        "lat_",
        "lat(s)",
        "_lat",
        "ylat",
        "latitude_y",
    ]
    default_lon = [
        "longitude",
        "lon",
        "long",
        "x",
        "lon_",
        "lon(e)",
        "long(e)",
        "_lon",
        "xlon",
        "longitude_x",
    ]

    lat_keywords = lat_keywords or default_lat
    lon_keywords = lon_keywords or default_lon

    def create_pattern(keywords):
        """Create regex pattern from keywords."""
        return "|".join(rf"\b{re.escape(keyword)}\b" for keyword in keywords)

    def find_matching_columns(columns, pattern, case_sensitive) -> List:
        """Find columns matching the pattern."""
        flags = 0 if case_sensitive else re.IGNORECASE
        return [col for col in columns if re.search(pattern, col, flags=flags)]

    try:
        # Create patterns
        lat_pattern = create_pattern(lat_keywords)
        lon_pattern = create_pattern(lon_keywords)

        # Find matching columns
        lat_matches = find_matching_columns(data.columns, lat_pattern, case_sensitive)
        lon_matches = find_matching_columns(data.columns, lon_pattern, case_sensitive)

        # Columns matching both patterns are ambiguous. The overlap is computed
        # once, up front, so it is removed from BOTH lists; filtering the lists
        # one after the other would leave the overlap in the second list.
        ambiguous = set(lat_matches) & set(lon_matches)
        lat_cols = [col for col in lat_matches if col not in ambiguous]
        lon_cols = [col for col in lon_matches if col not in ambiguous]

        # Detailed error messages based on what was found
        if not lat_cols and not lon_cols:
            columns_list = "\n".join(f"- {col}" for col in data.columns)
            raise ValueError(
                f"No latitude or longitude columns found. Available columns are:\n{columns_list}\n"
                f"Consider adding more keywords or checking column names."
            )

        if not lat_cols:
            found_lons = ", ".join(lon_cols)
            raise ValueError(
                f"Found longitude columns ({found_lons}) but no latitude columns. "
                "Check latitude keywords or column names."
            )

        if not lon_cols:
            found_lats = ", ".join(lat_cols)
            raise ValueError(
                f"Found latitude columns ({found_lats}) but no longitude columns. "
                "Check longitude keywords or column names."
            )

        if len(lat_cols) > 1 or len(lon_cols) > 1:
            raise ValueError(
                f"Multiple possible coordinate columns found:\n"
                f"Latitude candidates: {lat_cols}\n"
                f"Longitude candidates: {lon_cols}\n"
                "Please specify more precise keywords."
            )

        return lat_cols[0], lon_cols[0]

    except ValueError:
        # Detection failures are part of the documented contract; pass through.
        raise
    except Exception as e:
        # Anything else (e.g. non-string column labels) is wrapped, keeping the cause.
        raise RuntimeError(f"Error detecting coordinate columns: {str(e)}") from e

estimate_utm_crs_with_fallback(gdf, logger=LOGGER, fallback_crs='EPSG:3857')

Robustly estimate an appropriate UTM CRS for a GeoDataFrame.

This helper wraps GeoDataFrame.estimate_utm_crs and falls back to a configurable CRS (default: Web Mercator) when estimation fails or returns None.

Parameters:

Name Type Description Default
gdf GeoDataFrame

Input GeoDataFrame used to estimate a suitable UTM CRS.

required
logger

Optional logger used to emit warnings when falling back.

LOGGER
fallback_crs str

CRS to use when UTM estimation fails. Defaults to "EPSG:3857".

'EPSG:3857'

Returns:

Type Description

A CRS object or string suitable for GeoDataFrame.to_crs.

Source code in gigaspatial/processing/geo.py
def estimate_utm_crs_with_fallback(
    gdf: gpd.GeoDataFrame,
    logger=LOGGER,
    fallback_crs: str = "EPSG:3857",
):
    """
    Robustly estimate an appropriate UTM CRS for a GeoDataFrame.

    This helper wraps ``GeoDataFrame.estimate_utm_crs`` and falls back to a
    configurable CRS (default: Web Mercator) when the input is ``None`` or
    empty, when estimation raises, or when it returns a falsy value. Exactly
    one warning is logged per fallback.

    Args:
        gdf: Input GeoDataFrame used to estimate a suitable UTM CRS.
        logger: Optional logger used to emit warnings when falling back.
            Pass ``None`` to suppress the warnings.
        fallback_crs: CRS to use when UTM estimation fails. Defaults to "EPSG:3857".

    Returns:
        The estimated CRS, or ``fallback_crs`` when estimation is not possible.
        Either is intended to be passed to ``GeoDataFrame.to_crs``.
    """
    if gdf is None or gdf.empty:
        if logger is not None:
            logger.warning(
                "UTM CRS estimation requested for an empty GeoDataFrame; "
                f"falling back to {fallback_crs}."
            )
        return fallback_crs

    try:
        utm_crs = gdf.estimate_utm_crs()
    except Exception as e:
        # Deliberate best-effort: any estimation error degrades to the fallback.
        # Returning here avoids also logging the "returned None" warning below,
        # which would misreport an exception as an empty result.
        if logger is not None:
            logger.warning(
                f"UTM CRS estimation failed, using fallback CRS {fallback_crs}. "
                f"Error: {e}"
            )
        return fallback_crs

    if not utm_crs:
        if logger is not None:
            logger.warning(
                f"UTM CRS estimation returned None, using fallback CRS {fallback_crs}."
            )
        return fallback_crs

    return utm_crs

get_centroids(gdf)

Calculate the centroids of a (Multi)Polygon GeoDataFrame.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing (Multi)Polygon geometries.

required

Returns:

Type Description
GeoDataFrame

A new GeoDataFrame with Point geometries representing the centroids.

Raises:

Type Description
ValueError

If the input GeoDataFrame contains non-polygon geometries.

Source code in gigaspatial/processing/geo.py
def get_centroids(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Build a copy of a (Multi)Polygon GeoDataFrame holding centroid points.

    The input is left untouched; the returned copy has its ``geometry``
    column replaced by the centroid of each original geometry.

    Args:
        gdf: GeoDataFrame containing (Multi)Polygon geometries.

    Returns:
        A new GeoDataFrame whose ``geometry`` column contains the centroids.

    Raises:
        ValueError: If the input GeoDataFrame contains non-polygon geometries.
    """
    # Only areal geometry types are accepted
    accepted_types = ["Polygon", "MultiPolygon"]
    is_areal = gdf.geometry.geom_type.isin(accepted_types)
    if not all(is_areal):
        raise ValueError(
            "Input GeoDataFrame must contain only Polygon or MultiPolygon geometries."
        )

    # Work on a copy so the caller's frame keeps its original geometries
    result = gdf.copy()
    result["geometry"] = result.geometry.centroid
    return result

map_points_within_polygons(base_points_gdf, polygon_gdf)

Map whether each point is within any polygon.

Parameters:

Name Type Description Default
base_points_gdf

GeoDataFrame containing points.

required
polygon_gdf

GeoDataFrame containing polygons.

required

Returns:

Type Description

The same base_points_gdf object (modified in place) with an additional is_within boolean column.

Raises:

Type Description
ValueError

If either input contains an unexpected geometry type, or if the two GeoDataFrames do not share the same CRS.

Source code in gigaspatial/processing/geo.py
def map_points_within_polygons(base_points_gdf, polygon_gdf):
    """
    Flag each point according to whether it lies within any polygon.

    Note that the ``is_within`` column is written onto ``base_points_gdf``
    itself, and that same object is returned.

    Args:
        base_points_gdf: GeoDataFrame containing points.
        polygon_gdf: GeoDataFrame containing polygons.

    Returns:
        The `base_points_gdf` with an additional `is_within` boolean column.

    Raises:
        ValueError: If `base_points_gdf` holds non-Point geometries, if
            `polygon_gdf` holds anything other than Polygon/MultiPolygon
            geometries, or if the two CRSs differ.
    """
    # Geometry type checks
    point_types = base_points_gdf.geometry.geom_type
    if not all(point_types == "Point"):
        raise ValueError("`base_points_gdf` must contain only Point geometries.")

    polygon_types = polygon_gdf.geometry.geom_type
    if not all(polygon_types.isin(["Polygon", "MultiPolygon"])):
        raise ValueError(
            "`polygon_gdf` must contain only Polygon or MultiPolygon geometries."
        )

    # Both layers must share a CRS for the spatial predicate to be meaningful
    same_crs = base_points_gdf.crs == polygon_gdf.crs
    if not same_crs:
        raise ValueError("CRS of `base_points_gdf` and `polygon_gdf` must match.")

    # Left join keeps every point; unmatched points get a missing `index_right`
    polygon_geometries = polygon_gdf[["geometry"]]
    joined = gpd.sjoin(
        base_points_gdf, polygon_geometries, how="left", predicate="within"
    )

    # Index labels of the points that matched at least one polygon
    has_match = joined.index_right.notna()
    matched_labels = set(joined.index[has_match])

    base_points_gdf["is_within"] = base_points_gdf.index.isin(matched_labels)
    return base_points_gdf

simplify_geometries(gdf, tolerance=0.01, preserve_topology=True, geometry_column='geometry')

Simplify geometries to reduce file size and improve performance.

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing geometries to simplify.

required
tolerance float

Tolerance for simplification.

0.01
preserve_topology bool

Whether to preserve topology.

True
geometry_column str

Name of the geometry column.

'geometry'

Returns:

Type Description
GeoDataFrame

A new GeoDataFrame with simplified geometries.

Raises:

Type Description
ValueError

If the geometry column is missing. Note that invalid geometries in the column raise TypeError rather than ValueError.

Source code in gigaspatial/processing/geo.py
def simplify_geometries(
    gdf: gpd.GeoDataFrame,
    tolerance: float = 0.01,
    preserve_topology: bool = True,
    geometry_column: str = "geometry",
) -> gpd.GeoDataFrame:
    """
    Return a copy of ``gdf`` with the geometries in one column simplified.

    The input GeoDataFrame is not modified.

    Args:
        gdf: GeoDataFrame containing geometries to simplify.
        tolerance: Tolerance passed to ``simplify``.
        preserve_topology: Passed to ``simplify`` as ``preserve_topology``.
        geometry_column: Name of the geometry column.

    Returns:
        A new GeoDataFrame with simplified geometries.

    Raises:
        ValueError: If ``geometry_column`` is not a column of ``gdf``.
        TypeError: If any geometry in the column fails the ``is_valid`` check.
    """
    # The requested column has to exist
    column_missing = geometry_column not in gdf.columns
    if column_missing:
        raise ValueError(
            f"Geometry column '{geometry_column}' not found in the GeoDataFrame."
        )

    # Every geometry in the column has to pass the validity check
    geometries = gpd.GeoSeries(gdf[geometry_column])
    if not geometries.is_valid.all():
        raise TypeError(
            f"Geometry column '{geometry_column}' contains invalid geometries."
        )

    # Simplify on a copy so the caller's data stays intact
    simplified = gdf.copy()
    simplified[geometry_column] = simplified[geometry_column].simplify(
        tolerance=tolerance, preserve_topology=preserve_topology
    )
    return simplified

sat_images

Utilities for satellite imagery processing. Currently focuses on coordinate-to-pixel resolution calculations for Mercator projections.

calculate_pixels_at_location(gdf, resolution, bbox_size=300, crs='EPSG:3857')

Calculate the number of pixels required to cover a bounding box.

Calculates the dimensions in pixels for a given physical size (meters) around a coordinate, accounting for Mercator scale distortion.

Parameters:

Name Type Description Default
gdf

GeoDataFrame with Point geometries (WGS84).

required
resolution

Target resolution in meters per pixel.

required
bbox_size

Bounding box side length in meters.

300
crs

Target projection CRS.

'EPSG:3857'

Returns:

Type Description

Number of pixels per side (width and height).

Source code in gigaspatial/processing/sat_images.py
def calculate_pixels_at_location(gdf, resolution, bbox_size=300, crs="EPSG:3857"):
    """
    Calculate the number of pixels required to cover a bounding box.

    The mean latitude of the input points is used as the reference location.
    The requested resolution is scaled by ``cos(latitude)`` (the Mercator
    scale correction) and the bounding box size is divided by that effective
    resolution.

    Args:
        gdf: GeoDataFrame with Point geometries (WGS84).
        resolution: Target resolution in meters per pixel.
        bbox_size: Bounding box side length in meters.
        crs: Target projection CRS. It is only parsed by pyproj; the scale
            correction applied is always the Mercator ``cos(latitude)`` factor,
            whatever CRS is given.

    Returns:
        Number of pixels per side (width and height), rounded to an int.
    """

    # Average latitude of the points; longitude does not affect the result
    lat = gdf.geometry.y.mean()

    # Still construct the target CRS so `crs` is parsed by pyproj as before.
    # The former WGS84 -> target transform was dropped: its projected x/y were
    # never used in the result.
    pyproj.CRS(crs)

    # Calculate scale factor (distortion) at given latitude
    scale_factor = np.cos(np.radians(lat))  # Mercator scale correction

    # Adjust the effective resolution
    effective_resolution = resolution * scale_factor

    # Compute number of pixels per side
    pixels = bbox_size / effective_resolution
    return int(round(pixels))

tif_processor

High-performance Raster (TIF) processing engine. Provides comprehensive tools for TIF data handling, including merging, reprojection, clipping, graph conversion, and memory-efficient sampling. Supports single-band, RGB, RGBA, and multi-band rasters.

TifProcessor

Handler for TIF data processing and analysis.

Supports advanced operations like merging multiple rasters, reprojection, clipping to geometries, and converting raster data to formats like DataFrames or Graphs.

Attributes:

Name Type Description
dataset_path Union[Path, str, List[Union[Path, str]]]

Path(s) to the TIF file(s).

data_store Optional[DataStore]

DataStore instance for file access.

mode Literal['single', 'rgb', 'rgba', 'multi']

Processing mode ('single', 'rgb', 'rgba', 'multi').

merge_method Literal['first', 'last', 'min', 'max', 'mean']

Method for merging multiple rasters.

target_crs Optional[str]

Optional CRS to reproject to.

resampling_method Resampling

Resampling algorithm to use.

reprojection_resolution Optional[Tuple[float, float]]

Target pixel size for reprojection.

Source code in gigaspatial/processing/tif_processor.py
  39
  40
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71
  72
  73
  74
  75
  76
  77
  78
  79
  80
  81
  82
  83
  84
  85
  86
  87
  88
  89
  90
  91
  92
  93
  94
  95
  96
  97
  98
  99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class TifProcessor:
    """
    Handler for TIF data processing and analysis.

    Supports advanced operations like merging multiple rasters, reprojection,
    clipping to geometries, and converting raster data to formats like
    DataFrames or Graphs.

    Attributes:
        dataset_path: Path(s) to the TIF file(s).
        data_store: DataStore instance for file access.
        mode: Processing mode ('single', 'rgb', 'rgba', 'multi').
        merge_method: Method for merging multiple rasters.
        target_crs: Optional CRS to reproject to.
        resampling_method: Resampling algorithm to use.
        reprojection_resolution: Target pixel size for reprojection.
    """

    dataset_path: Union[Path, str, List[Union[Path, str]]]
    data_store: Optional[DataStore] = None
    mode: Literal["single", "rgb", "rgba", "multi"] = "single"
    merge_method: Literal["first", "last", "min", "max", "mean"] = "first"
    target_crs: Optional[str] = None  # For reprojection if needed
    resampling_method: Resampling = Resampling.nearest
    reprojection_resolution: Optional[Tuple[float, float]] = None

    def __post_init__(self):
        """
        Finish construction once the dataclass fields are populated.

        Falls back to a ``LocalDataStore`` when no store is supplied, creates
        the logger, an empty metadata cache and a temporary working directory,
        then prepares the raster:

        * a list of paths is validated and merged into one temporary file;
        * a single path is checked for existence and, when ``target_crs`` is
          set, reprojected into a temporary file.

        Finally the metadata is loaded and
        ``_validate_mode_band_compatibility()`` is called.

        Raises:
            FileNotFoundError: If a single dataset path cannot be found.
        """
        self.data_store = self.data_store or LocalDataStore()
        self.logger = config.get_logger(self.__class__.__name__)
        self._cache = {}
        self._temp_dir = tempfile.mkdtemp()
        # Locations of derived temporary rasters; all start out unset.
        self._merged_file_path = None
        self._reprojected_file_path = None
        self._clipped_file_path = None

        if isinstance(self.dataset_path, list):
            # Several rasters: validate, merge, then continue with the result.
            self.dataset_paths = [Path(p) for p in self.dataset_path]
            self._validate_multiple_datasets()
            self._merge_rasters()
            self.dataset_path = self._merged_file_path
        else:
            self.dataset_paths = [Path(self.dataset_path)]
            source = str(self.dataset_path)

            # Absolute paths on a LocalDataStore are checked directly on the
            # filesystem to avoid path resolution issues in the store.
            if isinstance(self.data_store, LocalDataStore) and os.path.isabs(source):
                found = os.path.exists(source)
            else:
                found = self.data_store.file_exists(source)
            if not found:
                raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")

            # Reproject the single raster up front when a target CRS is given.
            if self.target_crs:
                self.logger.info(f"Reprojecting single raster to {self.target_crs}...")
                with self.data_store.open(source, "rb") as raw:
                    with rasterio.MemoryFile(raw.read()) as memfile, memfile.open() as src:
                        self._reprojected_file_path = self._reproject_to_temp_file(
                            src, self.target_crs
                        )
                self.dataset_path = self._reprojected_file_path

        self._load_metadata()
        self._validate_mode_band_compatibility()

    @field_validator("dataset_path")
    def validate_dataset_path(cls, value):
        """
        Normalise the ``dataset_path`` field.

        A list holding exactly one entry is unwrapped to that entry, a longer
        list is returned unchanged, and a single ``Path`` or ``str`` is passed
        through as is.

        Raises:
            ValueError: If an empty list is supplied.
        """
        if isinstance(value, list):
            if not value:
                raise ValueError("No dataset paths provided.")
            return value[0] if len(value) == 1 else value

        if isinstance(value, (Path, str)):
            return value

        # Any other type yields None.
        return None

    @contextmanager
    def open_dataset(self):
        """
        Context manager that opens the raster this processor works on.

        A merged, reprojected or clipped temporary file (checked in that
        order) takes precedence. Otherwise the dataset path is opened directly
        for a ``LocalDataStore``, or read through the data store into an
        in-memory file for any other store.

        Yields:
            The dataset object returned by rasterio for the selected source.
        """
        derived_path = (
            self._merged_file_path
            or self._reprojected_file_path
            or self._clipped_file_path
        )
        if derived_path:
            with rasterio.open(derived_path) as src:
                yield src
            return

        if isinstance(self.data_store, LocalDataStore):
            with rasterio.open(str(self.dataset_path)) as src:
                yield src
            return

        # Non-local store: pull the bytes and open them from memory.
        with self.data_store.open(str(self.dataset_path), "rb") as raw:
            with rasterio.MemoryFile(raw.read()) as memfile, memfile.open() as src:
                yield src

    def reproject_to(
        self,
        target_crs: str,
        output_path: Optional[Union[str, Path]] = None,
        resampling_method: Optional[Resampling] = None,
        resolution: Optional[Tuple[float, float]] = None,
    ):
        """
        Reproject the current raster to a new CRS.

        Args:
            target_crs: The destination CRS (e.g., "EPSG:4326").
            output_path: Optional path to save the result. If None, the output
                is written into this processor's temporary directory.
            resampling_method: Optional override for the resampling algorithm.
                Only ``None`` falls back to ``self.resampling_method``.
            resolution: Optional target pixel resolution (x, y). Falls back to
                ``self.reprojection_resolution`` when not given.

        Returns:
            ``self.dataset_path`` when the raster is already in ``target_crs``
            (after copying it to ``output_path`` if one was given); otherwise
            a ``Path`` to the newly written reprojected file.
        """
        self.logger.info(f"Reprojecting raster to {target_crs}...")

        # Compare against None explicitly: ``Resampling.nearest`` has the
        # integer value 0 and is therefore falsy, so ``a or b`` would silently
        # discard an explicit request for nearest-neighbour resampling.
        if resampling_method is None:
            resampling_method = self.resampling_method
        resolution = resolution or self.reprojection_resolution

        with self.open_dataset() as src:
            if src.crs.to_string() == target_crs:
                self.logger.info(
                    "Raster is already in the target CRS. No reprojection needed."
                )
                # If output_path is specified, copy the file
                if output_path:
                    self.data_store.copy_file(str(self.dataset_path), output_path)
                return self.dataset_path

            dst_path = output_path or os.path.join(
                self._temp_dir, f"reprojected_single_{os.urandom(8).hex()}.tif"
            )

            with rasterio.open(
                dst_path,
                "w",
                **self._get_reprojection_profile(src, target_crs, resolution),
            ) as dst:
                # Warp band by band into the destination dataset.
                for band_idx in range(1, src.count + 1):
                    reproject(
                        source=rasterio.band(src, band_idx),
                        destination=rasterio.band(dst, band_idx),
                        src_transform=src.transform,
                        src_crs=src.crs,
                        dst_transform=dst.transform,
                        dst_crs=dst.crs,
                        resampling=resampling_method,
                        num_threads=multiprocessing.cpu_count(),
                    )

            self.logger.info(f"Reprojection complete. Output saved to {dst_path}")
            return Path(dst_path)

    def get_raster_info(
        self,
        include_statistics: bool = False,
        approx_ok: bool = False,
    ) -> Dict[str, Any]:
        """
        Get comprehensive metadata and statistics for the raster.

        Args:
            include_statistics: Whether to compute pixel statistics (mean, std, etc.).
            approx_ok: Whether to allow approximate statistics for speed.

        Returns:
            Dictionary containing metadata like dimensions, CRS, bounds, and optionally statistics.
        """
        info = {
            "count": self.count,
            "width": self.width,
            "height": self.height,
            "crs": self.crs,
            "bounds": self.bounds,
            "transform": self.transform,
            "dtypes": self.dtype,
            "nodata": self.nodata,
            "mode": self.mode,
            "is_merged": self.is_merged,
            "source_count": self.source_count,
        }

        if include_statistics:
            info["statistics"] = self._get_basic_statistics(approx_ok=approx_ok)

        return info

    def _reproject_to_temp_file(
        self, src: rasterio.DatasetReader, target_crs: str
    ) -> str:
        """
        Reproject ``src`` into a new file inside the temporary directory.

        Uses the processor's ``reprojection_resolution`` and
        ``resampling_method`` settings.

        Args:
            src: Open source dataset.
            target_crs: CRS to reproject to.

        Returns:
            Path of the temporary file that was written.
        """
        file_name = f"reprojected_temp_{os.urandom(8).hex()}.tif"
        dst_path = os.path.join(self._temp_dir, file_name)
        profile = self._get_reprojection_profile(
            src, target_crs, self.reprojection_resolution
        )

        with rasterio.open(dst_path, "w", **profile) as dst:
            # Bands are 1-indexed; warp each one into the destination.
            for offset in range(src.count):
                band_no = offset + 1
                reproject(
                    source=rasterio.band(src, band_no),
                    destination=rasterio.band(dst, band_no),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst.transform,
                    dst_crs=dst.crs,
                    resampling=self.resampling_method,
                )
        return dst_path

    def _validate_multiple_datasets(self):
        """
        Check that the rasters to be merged are mutually compatible.

        The first dataset acts as the reference. Band count and first-band
        dtype mismatches are errors; CRS, pixel size and nodata differences
        are only logged as warnings (the CRS and pixel size checks are skipped
        when ``target_crs`` is set).

        Raises:
            ValueError: If fewer than two paths are given, or a dataset's band
                count or dtype differs from the reference.
        """
        if len(self.dataset_paths) < 2:
            raise ValueError("Multiple dataset paths required for merging")

        reference_path, *other_paths = self.dataset_paths

        # Capture the reference properties from the first raster.
        with self.data_store.open(str(reference_path), "rb") as raw:
            with rasterio.MemoryFile(raw.read()) as memfile, memfile.open() as ref_src:
                ref_count = ref_src.count
                ref_dtype = ref_src.dtypes[0]
                ref_crs = ref_src.crs
                ref_transform = ref_src.transform
                ref_nodata = ref_src.nodata

        for i, path in enumerate(other_paths, 1):
            with self.data_store.open(str(path), "rb") as raw:
                with rasterio.MemoryFile(raw.read()) as memfile, memfile.open() as src:
                    # Hard requirements: same band count and dtype.
                    if src.count != ref_count:
                        raise ValueError(
                            f"Dataset {i} has {src.count} bands, expected {ref_count}"
                        )
                    if src.dtypes[0] != ref_dtype:
                        raise ValueError(
                            f"Dataset {i} has dtype {src.dtypes[0]}, expected {ref_dtype}"
                        )

                    # Soft checks: only warn.
                    if not self.target_crs and src.crs != ref_crs:
                        self.logger.warning(
                            f"Dataset {i} has CRS {src.crs}, expected {ref_crs}. "
                            "Consider setting target_crs parameter for reprojection before merging."
                        )
                    if self.target_crs is None and not self._transforms_compatible(
                        src.transform, ref_transform
                    ):
                        self.logger.warning(
                            f"Dataset {i} has different resolution. Resampling may be needed."
                        )
                    if src.nodata != ref_nodata:
                        self.logger.warning(
                            f"Dataset {i} has different nodata value: {src.nodata} vs {ref_nodata}"
                        )

    def _get_reprojection_profile(
        self,
        src: rasterio.DatasetReader,
        target_crs: str,
        resolution: Optional[Tuple[float, float]],
        compression: str = "lzw",
    ):
        """
        Build the output profile for reprojecting ``src`` to ``target_crs``.

        Args:
            src: Open source dataset.
            target_crs: Destination CRS.
            resolution: Target pixel size; when falsy the default computed by
                ``calculate_default_transform`` is used.
            compression: Value written to the profile's "compress" key.

        Returns:
            A copy of ``src.profile`` with CRS, transform, width, height and
            compression replaced.
        """
        transform_options = {}
        if resolution:
            src_res = (abs(src.transform.a), abs(src.transform.e))
            self.logger.info(
                f"Using target resolution: {resolution}. Source resolution: {src_res}."
            )
            # Only pass a resolution when one was explicitly requested.
            transform_options["resolution"] = resolution

        dst_transform, width, height = calculate_default_transform(
            src.crs,
            target_crs,
            src.width,
            src.height,
            *src.bounds,
            **transform_options,
        )

        overrides = {
            "crs": target_crs,
            "transform": dst_transform,
            "width": width,
            "height": height,
            "compress": compression,  # compress the output to save space
        }
        profile = src.profile.copy()
        profile.update(overrides)
        return profile

    def _transforms_compatible(self, transform1, transform2, tolerance=1e-6):
        """Check if two transforms have compatible pixel sizes."""
        return (
            abs(transform1.a - transform2.a) < tolerance
            and abs(transform1.e - transform2.e) < tolerance
        )

    def _merge_rasters(self):
        """
        Combine all rasters in ``self.dataset_paths`` into one temporary file.

        Each source is first materialised as a local temporary file
        (reprojected when ``target_crs`` is set and differs from the source
        CRS, copied otherwise). The temporary copies are merged with
        ``self.merge_method`` and the result is written to
        ``self._merged_file_path``. Opened datasets are closed and the
        per-source temporary files removed even if merging fails.
        """
        self.logger.info(f"Merging {len(self.dataset_paths)} rasters...")

        open_datasets = []
        scratch_files = []
        try:
            for path in self.dataset_paths:
                with self.data_store.open(str(path), "rb") as raw:
                    with rasterio.MemoryFile(raw.read()) as memfile, memfile.open() as src:
                        if self.target_crs and src.crs != self.target_crs:
                            self.logger.info(
                                f"Reprojecting {path.name} to {self.target_crs} before merging."
                            )
                            local_copy = self._reproject_to_temp_file(
                                src, self.target_crs
                            )
                            scratch_files.append(local_copy)
                        else:
                            # No reprojection needed: dump the raster unchanged.
                            local_copy = os.path.join(
                                self._temp_dir,
                                f"temp_{path.stem}_{os.urandom(4).hex()}.tif",
                            )
                            scratch_files.append(local_copy)
                            with rasterio.open(local_copy, "w", **src.profile) as dst:
                                dst.write(src.read())

                        open_datasets.append(rasterio.open(local_copy))

            self._merged_file_path = os.path.join(self._temp_dir, "merged_raster.tif")

            # "mean" is handled by the in-class helper; everything else goes
            # through the merge() function.
            if self.merge_method == "mean":
                merged_array, merged_transform = self._merge_with_mean(open_datasets)
            else:
                merged_array, merged_transform = merge(
                    open_datasets,
                    method=self.merge_method,
                    resampling=self.resampling_method,
                )

            # Base the output profile on the first local copy.
            ref_src = open_datasets[0]
            profile = ref_src.profile.copy()
            profile.update(
                {
                    "height": merged_array.shape[-2],
                    "width": merged_array.shape[-1],
                    "transform": merged_transform,
                    "crs": self.target_crs or ref_src.crs,
                }
            )

            with rasterio.open(self._merged_file_path, "w", **profile) as dst:
                dst.write(merged_array)
        finally:
            for handle in open_datasets:
                if hasattr(handle, "close"):
                    handle.close()

            # Best-effort removal of the per-source temporary files.
            for scratch in scratch_files:
                try:
                    os.remove(scratch)
                except OSError:
                    pass

        self.logger.info("Raster merging completed!")

    def _merge_with_mean(self, src_files):
        """
        Merge rasters by averaging, per cell, the valid pixels of all sources.

        The output grid covers the union of the source bounds and uses the
        pixel size of the first source. Validity is decided from the first
        band of each source and applied to all of its bands.

        Args:
            src_files: Open datasets to merge. They are assumed to share the
                same CRS and pixel size (not verified here).

        Returns:
            Tuple ``(mean_array, merged_transform)`` where ``mean_array`` has
            shape (bands, height, width) and the dtype of the first source.
            Cells without any valid pixel hold the first source's nodata
            value, or 0 when that is unset.
        """

        def _to_cells(distance, pixel_size):
            # Round to the nearest whole pixel (half up). Plain int() would
            # truncate quotients such as 2.9999999999999996 down to 2, which
            # drops a row/column and breaks the accumulation slices below.
            return int(np.floor(distance / pixel_size + 0.5))

        reference = src_files[0]
        transform = reference.transform
        pixel_w = abs(transform.a)
        pixel_h = abs(transform.e)

        # Union of all source extents.
        bounds = reference.bounds
        for src in src_files[1:]:
            bounds = rasterio.coords.BoundingBox(
                min(bounds.left, src.bounds.left),
                min(bounds.bottom, src.bounds.bottom),
                max(bounds.right, src.bounds.right),
                max(bounds.top, src.bounds.top),
            )

        # Dimensions of the merged grid.
        width = _to_cells(bounds.right - bounds.left, pixel_w)
        height = _to_cells(bounds.top - bounds.bottom, pixel_h)

        merged_transform = rasterio.transform.from_bounds(
            bounds.left, bounds.bottom, bounds.right, bounds.top, width, height
        )

        estimated_memory = height * width * reference.count * 8  # float64
        if estimated_memory > 1e9:  # 1GB threshold
            self.logger.warning(
                f"Large memory usage expected: {estimated_memory/1e9:.1f}GB"
            )

        # Running sum per band and number of contributing sources per cell.
        sum_array = np.zeros((reference.count, height, width), dtype=np.float64)
        count_array = np.zeros((height, width), dtype=np.int32)

        for src in src_files:
            data = src.read()

            # Position of this source inside the merged grid.
            src_bounds = src.bounds
            col_off = _to_cells(src_bounds.left - bounds.left, pixel_w)
            row_off = _to_cells(bounds.top - src_bounds.top, pixel_h)

            nodata = src.nodata
            if nodata is None:
                valid_mask = np.ones(data[0].shape, dtype=bool)
            elif np.isnan(nodata):
                # NaN never compares equal to itself, so test it explicitly;
                # otherwise NaN pixels would poison the running sum.
                valid_mask = ~np.isnan(data[0])
            else:
                valid_mask = data[0] != nodata

            end_row = row_off + data.shape[1]
            end_col = col_off + data.shape[2]

            sum_array[:, row_off:end_row, col_off:end_col] += np.where(
                valid_mask, data, 0
            )
            count_array[row_off:end_row, col_off:end_col] += valid_mask.astype(np.int32)

        # Divide only where at least one source contributed; other cells keep
        # the fill value pre-loaded into ``out``.
        mean_array = np.divide(
            sum_array,
            count_array,
            out=np.full_like(sum_array, reference.nodata or 0, dtype=sum_array.dtype),
            where=count_array > 0,
        )

        return mean_array.astype(reference.dtypes[0]), merged_transform

    def _load_metadata(self):
        """
        Read the raster's metadata and store it in ``self._cache``.

        The dataset is opened through ``open_dataset()`` on every call and the
        transform, CRS string, bounds, dimensions, resolution, nodata value,
        band count and first-band dtype are (re)written into the cache.

        Raises:
            FileNotFoundError: If the raster cannot be opened or read.
            RuntimeError: For any other failure while reading the metadata.
        """
        try:
            with self.open_dataset() as src:
                self._cache["transform"] = src.transform
                self._cache["crs"] = src.crs.to_string()
                self._cache["bounds"] = src.bounds
                self._cache["width"] = src.width
                self._cache["height"] = src.height
                self._cache["resolution"] = (abs(src.transform.a), abs(src.transform.e))
                self._cache["x_transform"] = src.transform.a
                self._cache["y_transform"] = src.transform.e
                self._cache["nodata"] = src.nodata
                self._cache["count"] = src.count
                # Only the dtype of the first band is recorded.
                self._cache["dtype"] = src.dtypes[0]
        except (rasterio.errors.RasterioIOError, FileNotFoundError) as e:
            # Chain explicitly so the original error stays the direct cause.
            raise FileNotFoundError(f"Could not read raster metadata: {e}") from e
        except Exception as e:
            raise RuntimeError(f"Unexpected error loading metadata: {e}") from e

    @property
    def is_merged(self) -> bool:
        """Check if this processor was created from multiple rasters."""
        return len(self.dataset_paths) > 1

    @property
    def source_count(self) -> int:
        """Get the number of source rasters."""
        return len(self.dataset_paths)

    @property
    def transform(self):
        """Get the transform from the TIF file"""
        return self._cache["transform"]

    @property
    def crs(self):
        """Get the coordinate reference system from the TIF file"""
        return self._cache["crs"]

    @property
    def bounds(self):
        """Get the bounds of the TIF file"""
        return self._cache["bounds"]

    @property
    def resolution(self) -> Tuple[float, float]:
        """Get the x and y resolution (pixel width and height or pixel size) from the TIF file"""
        return self._cache["resolution"]

    @property
    def x_transform(self) -> float:
        """Get the x transform from the TIF file"""
        return self._cache["x_transform"]

    @property
    def y_transform(self) -> float:
        """Get the y transform from the TIF file"""
        return self._cache["y_transform"]

    @property
    def count(self) -> int:
        """Get the band count from the TIF file"""
        return self._cache["count"]

    @property
    def nodata(self) -> int:
        """Get the value representing no data in the rasters"""
        return self._cache["nodata"]

    @property
    def dtype(self):
        """Get the data types from the TIF file"""
        return self._cache.get("dtype", [])

    @property
    def width(self):
        return self._cache["width"]

    @property
    def height(self):
        return self._cache["height"]

    def to_dataframe(
        self,
        drop_nodata=True,
        check_memory=True,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Convert the raster data into a pandas DataFrame.

        Args:
            drop_nodata: If True, pixels with the nodata value are excluded.
            check_memory: If True, checks system memory availability before loading.
            min_value: Optional minimum threshold to filter pixels.
            max_value: Optional maximum threshold to filter pixels.
            **kwargs: Additional arguments like `band_number` or `band_names`.

        Returns:
            A DataFrame with 'lon', 'lat', and band values.

        Raises:
            ValueError: If processing fails due to mode mismatch or invalid data.
        """
        # Memory guard check
        if check_memory:
            self._memory_guard("conversion", threshold_percent=80.0)

        try:
            if self.mode == "single":
                return self._to_dataframe(
                    band_number=kwargs.get("band_number", 1),
                    drop_nodata=drop_nodata,
                    band_names=kwargs.get("band_names", None),
                    min_value=min_value,
                    max_value=max_value,
                )
            else:
                return self._to_dataframe(
                    band_number=None,  # All bands
                    drop_nodata=drop_nodata,
                    band_names=kwargs.get("band_names", None),
                    min_value=min_value,
                    max_value=max_value,
                )
        except Exception as e:
            raise ValueError(
                f"Failed to process TIF file in mode '{self.mode}'. "
                f"Please ensure the file is valid and matches the selected mode. "
                f"Original error: {str(e)}"
            )


    def to_geodataframe(
        self,
        check_memory=True,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
        **kwargs,
    ) -> gpd.GeoDataFrame:
        """
        Convert the raster data into a GeoDataFrame of pixel boxes.

        Each row of the DataFrame returned by ``to_dataframe`` gets a box
        geometry one pixel wide and high, centred on the row's
        ('lon', 'lat') coordinate.

        Args:
            check_memory: If True, runs the memory guard first.
            min_value: Optional minimum threshold for pixel values.
            max_value: Optional maximum threshold for pixel values.
            **kwargs: Additional arguments passed to `to_dataframe`.

        Returns:
            A GeoDataFrame in the raster's CRS with one box per retained pixel.
        """
        if check_memory:
            self._memory_guard("conversion", threshold_percent=80.0)

        # Filter first so geometries are only built for retained pixels.
        df = self.to_dataframe(
            check_memory=False, min_value=min_value, max_value=max_value, **kwargs
        )

        x_res, y_res = self.resolution
        half_w = x_res / 2
        half_h = y_res / 2

        # One bounding box per pixel, centred on its coordinate.
        geometries = [
            box(lon - half_w, lat - half_h, lon + half_w, lat + half_h)
            for lon, lat in zip(df["lon"], df["lat"])
        ]

        return gpd.GeoDataFrame(df, geometry=geometries, crs=self.crs)

    def to_dataframe_chunked(
        self, drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs
    ):
        """
        Convert raster to DataFrame using memory-efficient chunked processing.

        Args:
            drop_nodata: Whether to exclude pixels with the nodata value.
            chunk_size: Specific number of rows per chunk. If None, it is auto-calculated.
            target_memory_mb: Target memory limit per chunk in megabytes.
            **kwargs: Additional arguments like `band_number` or `band_names`.

        Returns:
            A consolidated DataFrame containing all processed chunks.
        """

        if chunk_size is None:
            chunk_size = self._calculate_optimal_chunk_size(
                "conversion", target_memory_mb
            )

        windows = self._get_chunk_windows(chunk_size)

        # SIMPLE ROUTING
        if self.mode == "single":
            return self._to_dataframe_chunked(
                windows,
                band_number=kwargs.get("band_number", 1),
                drop_nodata=drop_nodata,
                band_names=kwargs.get("band_names", None),
            )
        else:  # rgb, rgba, multi
            return self._to_dataframe_chunked(
                windows,
                band_number=None,
                drop_nodata=drop_nodata,
                band_names=kwargs.get("band_names", None),
            )

    def clip_to_geometry(
        self,
        geometry: Union[
            Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
        ],
        crop: bool = True,
        all_touched: bool = True,
        invert: bool = False,
        nodata: Optional[Union[int, float]] = None,
        pad: bool = False,
        pad_width: float = 0.5,
        return_clipped_processor: bool = True,
    ) -> Union["TifProcessor", tuple]:
        """
        Clip the raster with the given geometry.

        Args:
            geometry: Geometry to clip to (Polygon, GeoDataFrame, GeoSeries,
                GeoJSON-like dict or list of dicts).
            crop: Forwarded to ``mask``; crop the output to the geometry extent.
            all_touched: Forwarded to ``mask``; include every touched pixel.
            invert: Forwarded to ``mask``; mask inside instead of outside.
            nodata: Nodata value for the output; defaults to the source's.
            pad: Forwarded to ``mask``; pad the geometry before clipping.
            pad_width: Forwarded to ``mask``; padding width.
            return_clipped_processor: If True, wrap the result in a new
                processor via ``_create_clipped_processor``.

        Returns:
            The clipped processor, or the tuple
            ``(clipped_data, clipped_transform, clipped_meta)``.

        Raises:
            ValueError: If the geometry does not overlap the raster; other
                ``ValueError``s raised while masking are re-raised unchanged.
        """
        # Normalise the geometry input, then check its CRS.
        shapes = self._prepare_geometry_for_clipping(geometry)
        self._validate_geometry_crs(geometry)

        with self.open_dataset() as src:
            try:
                clipped_data, clipped_transform = mask(
                    dataset=src,
                    shapes=shapes,
                    crop=crop,
                    all_touched=all_touched,
                    invert=invert,
                    nodata=nodata,
                    pad=pad,
                    pad_width=pad_width,
                    filled=True,
                )

                # Describe the clipped raster: new shape, transform and nodata.
                output_nodata = src.nodata if nodata is None else nodata
                clipped_meta = src.meta.copy()
                clipped_meta.update(
                    {
                        "height": clipped_data.shape[1],
                        "width": clipped_data.shape[2],
                        "transform": clipped_transform,
                        "nodata": output_nodata,
                    }
                )
            except ValueError as e:
                if "Input shapes do not overlap raster" not in str(e):
                    raise
                raise ValueError(
                    "The geometry does not overlap with the raster. "
                    "Check that both are in the same coordinate reference system."
                ) from e

        if not return_clipped_processor:
            return clipped_data, clipped_transform, clipped_meta

        # Wrap the clipped data in a new TifProcessor.
        return self._create_clipped_processor(clipped_data, clipped_meta)

    def clip_to_bounds(
        self,
        bounds: tuple,
        bounds_crs: Optional[str] = None,
        return_clipped_processor: bool = True,
    ) -> Union["TifProcessor", tuple]:
        """
        Clip the raster to a rectangular bounding box.

        Args:
            bounds: Bounding box as (minx, miny, maxx, maxy).
            bounds_crs: CRS of ``bounds``. When given and different from the
                raster CRS, the box is reprojected to the raster CRS first.
            return_clipped_processor: If True, returns a new TifProcessor.

        Returns:
            Whatever ``clip_to_geometry`` returns for the bounding box.
        """
        bbox_geom = box(*bounds)

        if bounds_crs is not None:
            raster_crs = self.crs
            if not (self.crs == bounds_crs):
                # Reproject the box from the bounds CRS into the raster CRS.
                reprojected = gpd.GeoDataFrame(
                    [1], geometry=[bbox_geom], crs=bounds_crs
                ).to_crs(raster_crs)
                bbox_geom = reprojected.geometry.iloc[0]

        return self.clip_to_geometry(
            geometry=bbox_geom,
            crop=True,
            return_clipped_processor=return_clipped_processor,
        )

    def to_graph(
        self,
        connectivity: Literal[4, 8] = 4,
        band: Optional[int] = None,
        include_coordinates: bool = False,
        graph_type: Literal["networkx", "sparse"] = "networkx",
        check_memory: bool = True,
    ) -> Union[nx.Graph, sp.csr_matrix]:
        """
        Convert the raster into a graph representation based on pixel adjacency.

        Every valid (non-nodata) pixel becomes a node whose ID is its position,
        in row-major order, among the valid pixels. Adjacent valid pixels are
        joined by one undirected edge whose weight is the value of the
        higher-ID pixel of the pair.

        Args:
            connectivity: Neighborhood connectivity (4 for von Neumann, 8 for Moore).
                Any value other than 4 is treated as 8.
            band: Band number to use for node values (1-indexed). Defaults to band 1.
            include_coordinates: If True, adds 'x' and 'y' attributes to nodes
                (only relevant for the 'networkx' output).
            graph_type: Output type ('networkx' for Graph object, 'sparse' for CSR matrix).
            check_memory: If True, validates memory availability before processing.

        Returns:
            A NetworkX Graph or a symmetric SciPy sparse CSR matrix of shape
            (n_valid_pixels, n_valid_pixels).

        Raises:
            ValueError: If the requested band is not available in the raster.
        """
        # Memory guard check
        if check_memory:
            self._memory_guard("graph", threshold_percent=80.0)

        build_networkx = graph_type == "networkx"

        with self.open_dataset() as src:
            band_idx = band - 1 if band is not None else 0
            if band_idx < 0 or band_idx >= src.count:
                raise ValueError(
                    f"Band {band} not available. Raster has {src.count} bands"
                )

            data = src.read(band_idx + 1)
            nodata = src.nodata if src.nodata is not None else self.nodata

            if nodata is None:
                valid_mask = np.ones_like(data, dtype=bool)
            else:
                # NaN never compares equal to itself, so a NaN nodata value
                # has to be detected with isnan rather than `!=`.
                try:
                    nodata_is_nan = bool(np.isnan(nodata))
                except TypeError:
                    nodata_is_nan = False
                valid_mask = ~np.isnan(data) if nodata_is_nan else data != nodata

            height, width = data.shape

            # Find all valid pixels (np.where returns them in row-major order)
            valid_rows, valid_cols = np.where(valid_mask)
            num_valid_pixels = len(valid_rows)

            # Sequential mapping from (row, col) to node ID; -1 marks invalid pixels
            node_map = np.full(data.shape, -1, dtype=int)
            node_map[valid_rows, valid_cols] = np.arange(num_valid_pixels)

            # Define neighborhood offsets
            if connectivity == 4:
                # von Neumann neighborhood (4-connectivity)
                offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
            else:
                # Moore neighborhood (8-connectivity)
                offsets = [
                    (-1, -1),
                    (-1, 0),
                    (-1, 1),
                    (0, -1),
                    (0, 1),
                    (1, -1),
                    (1, 0),
                    (1, 1),
                ]

            nodes_to_add = []
            edge_sources = []
            edge_targets = []
            edge_weights = []

            for node_id, (row, col) in enumerate(zip(valid_rows, valid_cols)):
                # Node attributes are only needed for the NetworkX output
                if build_networkx:
                    node_attrs = {"value": float(data[row, col])}
                    if include_coordinates:
                        x, y = src.xy(row, col)
                        node_attrs["x"] = x
                        node_attrs["y"] = y
                    nodes_to_add.append((node_id, node_attrs))

                for dy, dx in offsets:
                    neighbor_row, neighbor_col = row + dy, col + dx
                    if not (0 <= neighbor_row < height and 0 <= neighbor_col < width):
                        continue

                    neighbor_id = int(node_map[neighbor_row, neighbor_col])
                    # Invalid pixels map to -1, so this single comparison both
                    # rejects them and ensures each edge is added only once.
                    if neighbor_id > node_id:
                        edge_sources.append(node_id)
                        edge_targets.append(neighbor_id)
                        edge_weights.append(float(data[neighbor_row, neighbor_col]))

            if build_networkx:
                G = nx.Graph()
                G.add_nodes_from(nodes_to_add)
                G.add_weighted_edges_from(zip(edge_sources, edge_targets, edge_weights))
                return G

            # Sparse matrix: explicit dtypes keep indices integral and make
            # the edge-free case (no edges at all) produce an empty matrix.
            row_indices = np.asarray(edge_sources, dtype=np.int64)
            col_indices = np.asarray(edge_targets, dtype=np.int64)
            weights = np.asarray(edge_weights, dtype=np.float64)

            # Add reverse edges for symmetric matrix
            from_idx = np.concatenate([row_indices, col_indices])
            to_idx = np.concatenate([col_indices, row_indices])
            weights = np.concatenate([weights, weights])

            return sp.coo_matrix(
                (weights, (from_idx, to_idx)),
                shape=(num_valid_pixels, num_valid_pixels),
            ).tocsr()

    def sample_by_coordinates(
        self, coordinate_list: List[Tuple[float, float]], **kwargs
    ) -> Union[np.ndarray, dict]:
        """
        Extract raster values at specific point coordinates.

        Args:
            coordinate_list: List of (longitude, latitude) tuples.
            **kwargs: Additional arguments passed to the dataset's ``sample``
                call. In RGB/RGBA mode an ``indexes`` entry is ignored because
                the band indexes are fixed by the mode.

        Returns:
            - RGB/RGBA mode: dict mapping colour name to a list of values.
            - Multi-band: 2-D numpy array with one row per coordinate.
            - Single-band: 1-D numpy array with one value per coordinate.

        Raises:
            ValueError: If the mode is RGB/RGBA and the band count does not match.
        """
        self.logger.info("Sampling raster values at the coordinates...")

        colour_bands = {
            "rgb": ("red", "green", "blue"),
            "rgba": ("red", "green", "blue", "alpha"),
        }

        with self.open_dataset() as src:
            if self.mode in colour_bands:
                colours = colour_bands[self.mode]
                if self.count != len(colours):
                    raise ValueError(
                        f"{self.mode.upper()} mode requires a {len(colours)}-band TIF file"
                    )

                # The mode dictates which bands are read, so drop any
                # caller-supplied `indexes` to avoid a duplicate keyword.
                extra = {k: v for k, v in kwargs.items() if k != "indexes"}

                # Sample every band in a single pass instead of once per band.
                samples = list(
                    src.sample(
                        coordinate_list,
                        indexes=list(range(1, len(colours) + 1)),
                        **extra,
                    )
                )
                return {
                    colour: [vals[i] for vals in samples]
                    for i, colour in enumerate(colours)
                }

            if self.count > 1:
                return np.array(list(src.sample(coordinate_list, **kwargs)))

            return np.array(
                [vals[0] for vals in src.sample(coordinate_list, **kwargs)]
            )

    def sample_by_polygons(
        self,
        polygon_list,
        stat: Union[str, Callable, List[Union[str, Callable]]] = "mean",
    ):
        """
        Sample raster values within polygons and compute aggregate statistics.

        Polygons that contain no valid pixels, or for which masking or the
        statistic raises, yield NaN instead of aborting the whole run.

        Args:
            polygon_list: List of Shapely Polygon or MultiPolygon objects.
            stat: Statistic(s) to compute. Can be a string naming a numpy
                  function (e.g., 'mean') or 'count', a callable, or a list
                  of both.

        Returns:
            Numpy array of results (if single stat) or a list of dictionaries
            keyed by statistic name (if ``stat`` is a list).
        """
        # Determine if single or multiple stats
        single_stat = not isinstance(stat, list)
        stats_list = [stat] if single_stat else stat

        # Resolve every requested statistic to a (callable, name) pair
        stat_funcs = []
        stat_names = []
        for s in stats_list:
            if callable(s):
                stat_funcs.append(s)
                stat_names.append(getattr(s, "__name__", f"custom_{len(stat_names)}"))
            else:
                # 'count' has no numpy function of that name; use len instead
                stat_funcs.append(len if s == "count" else getattr(np, s))
                stat_names.append(s)

        def _nan_result():
            """Placeholder result for a polygon that could not be evaluated."""
            if single_stat:
                return np.nan
            return {name: np.nan for name in stat_names}

        results = []

        with self.open_dataset() as src:
            for polygon in tqdm(polygon_list):
                try:
                    out_image, _ = mask(src, [polygon], crop=True, filled=False)

                    # Use masked arrays for more efficient nodata handling
                    if hasattr(out_image, "mask"):
                        valid_data = out_image.compressed()
                    elif self.nodata is not None:
                        # `is not None` so that a nodata value of 0 is honoured
                        valid_data = out_image[out_image != self.nodata]
                    else:
                        valid_data = out_image.flatten()

                    if len(valid_data) == 0:
                        results.append(_nan_result())
                    elif single_stat:
                        results.append(stat_funcs[0](valid_data))
                    else:
                        # Compute all statistics for this polygon; one failing
                        # statistic must not discard the others.
                        polygon_stats = {}
                        for func, name in zip(stat_funcs, stat_names):
                            try:
                                polygon_stats[name] = func(valid_data)
                            except Exception:
                                polygon_stats[name] = np.nan
                        results.append(polygon_stats)

                except Exception as e:
                    # Best-effort sampling: record the failure and keep going
                    self.logger.debug(f"Error processing polygon: {e}")
                    results.append(_nan_result())

        return np.array(results) if single_stat else results

    def sample_by_polygons_batched(
        self,
        polygon_list: List[Union[Polygon, MultiPolygon]],
        stat: Union[str, Callable] = "mean",
        batch_size: int = 100,
        n_workers: int = 4,
        show_progress: bool = True,
        check_memory: bool = True,
        **kwargs,
    ) -> np.ndarray:
        """
        Sample raster values by polygons in parallel using batch processing.

        Polygons are split into batches that are distributed over a pool of
        worker processes; each worker opens the raster once in its initializer.

        Args:
            polygon_list: List of Shapely Polygon or MultiPolygon objects.
            stat: Statistic to compute for each polygon. A callable, the name
                of a numpy function (e.g. 'mean'), or 'count'.
            batch_size: Number of polygons to process in each worker batch.
            n_workers: Number of parallel processes to use.
            show_progress: If True, displays a progress bar.
            check_memory: If True, validates memory availability before starting.
            **kwargs: Additional arguments (currently unused).

        Returns:
            Numpy array of statistics for each polygon.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # Imported at the top of the function: a later function-local
        # `import warnings` would make the name local everywhere and break
        # the memory warning below with UnboundLocalError.
        import multiprocessing as mp
        import sys
        import warnings

        # Memory guard check with n_workers consideration
        if check_memory:
            is_safe = self._memory_guard(
                "batched_sampling",
                threshold_percent=85.0,
                n_workers=n_workers,
                raise_error=False,
            )

            if not is_safe:
                # Suggest reducing n_workers
                memory_info = self._check_available_memory()
                estimates = self._estimate_memory_usage("batched_sampling", n_workers=1)

                # Calculate optimal workers
                suggested_workers = max(
                    1, int(memory_info["available"] * 0.7 / estimates["per_worker"])
                )

                warnings.warn(
                    f"Consider reducing n_workers from {n_workers} to {suggested_workers} "
                    f"to reduce memory pressure.",
                    ResourceWarning,
                )

        # Platform check
        if sys.platform in ["win32", "darwin"]:
            if mp.get_start_method(allow_none=True) != "fork":
                warnings.warn(
                    "Batched sampling may not work on Windows/macOS. "
                    "Use sample_by_polygons() if you encounter errors.",
                    RuntimeWarning,
                )

        if len(polygon_list) == 0:
            return np.array([])

        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Resolve the statistic the same way sample_by_polygons does
        if callable(stat):
            stat_func = stat
        elif stat == "count":
            stat_func = len
        else:
            stat_func = getattr(np, stat)

        polygon_chunks = [
            polygon_list[start : start + batch_size]
            for start in range(0, len(polygon_list), batch_size)
        ]

        with mp.Pool(
            initializer=self._initializer_worker, processes=n_workers
        ) as pool:
            process_func = partial(self._process_polygon_batch, stat_func=stat_func)
            batch_iter = pool.imap(process_func, polygon_chunks)
            if show_progress:
                batch_iter = tqdm(
                    batch_iter,
                    total=len(polygon_chunks),
                    desc="Sampling polygons",
                )
            # Consume inside the pool context so workers are still alive
            results = [item for batch in batch_iter for item in batch]

        return np.array(results)

    def _initializer_worker(self):
        """
        Per-worker initializer for the multiprocessing pool.

        Reads the raster bytes into a rasterio MemoryFile and publishes both
        the memory file and the opened dataset as module-level globals, so the
        dataset is opened once per worker process rather than once per task.
        """
        global src_handle, memfile_handle

        # Priority: merged > reprojected > original (same as open_dataset).
        # Merged and reprojected outputs are local temp files.
        local_file_path = self._merged_file_path or self._reprojected_file_path
        if not local_file_path and isinstance(self.data_store, LocalDataStore):
            # Original file lives on the local filesystem - can open directly
            local_file_path = str(self.dataset_path)

        if local_file_path:
            stream = open(local_file_path, "rb")
        else:
            # Custom DataStore: let the store provide the file object
            stream = self.data_store.open(str(self.dataset_path), "rb")

        with stream as f:
            memfile_handle = rasterio.MemoryFile(f.read())
            src_handle = memfile_handle.open()

    def _get_worker_dataset(self):
        """
        Get dataset handle for worker process.

        Returns:
            The dataset stored in the module-level ``src_handle`` global.

        Raises:
            RuntimeError: If no dataset has been stored for this process.
        """
        # Look the global up defensively: if the worker initializer never ran,
        # the name may not be bound at all, and a bare read would raise
        # NameError instead of the RuntimeError callers are told to expect.
        handle = globals().get("src_handle")
        if handle is None:
            raise RuntimeError("Raster dataset not initialized in this process.")
        return handle

    def _process_single_polygon(self, polygon, stat_func):
        """
        Compute ``stat_func`` over the raster pixels covered by one polygon.

        Intended to run inside a worker process whose dataset was opened by
        the pool initializer.

        Args:
            polygon: Geometry passed to the raster mask operation.
            stat_func: Callable applied to the 1-D array of valid pixel values.

        Returns:
            The statistic, or NaN if the polygon covers no valid pixels or
            any error occurs.
        """
        try:
            src = self._get_worker_dataset()
            out_image, _ = mask(src, [polygon], crop=True, filled=False)

            if hasattr(out_image, "mask"):
                valid_data = out_image.compressed()
            elif self.nodata is not None:
                # `is not None` so that a nodata value of 0 is honoured
                valid_data = out_image[out_image != self.nodata]
            else:
                valid_data = out_image.flatten()

            return stat_func(valid_data) if len(valid_data) > 0 else np.nan
        except RuntimeError as e:
            self.logger.error(f"Worker not initialized: {e}")
            return np.nan
        except Exception as e:
            # Best-effort: a single bad polygon must not fail the whole batch
            self.logger.debug(f"Error processing polygon: {e}")
            return np.nan

    def _process_polygon_batch(self, polygon_batch, stat_func):
        """
        Apply ``_process_single_polygon`` to each polygon of a batch, in order.

        Args:
            polygon_batch: Iterable of polygons belonging to one batch.
            stat_func: Statistic callable forwarded to the per-polygon helper.

        Returns:
            List with one result per polygon, in input order.
        """
        batch_results = []
        for geom in polygon_batch:
            batch_results.append(self._process_single_polygon(geom, stat_func))
        return batch_results

    def _to_dataframe(
        self,
        band_number: Optional[int] = None,
        drop_nodata: bool = True,
        band_names: Optional[Union[str, List[str]]] = None,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
    ) -> pd.DataFrame:
        """
        Process TIF to DataFrame - handles both single-band and multi-band.

        Args:
            band_number: Specific band to read (1-indexed). If None, reads all bands.
            drop_nodata: Whether to drop nodata values
            band_names: Custom band name(s). In single-band mode this may be a
                string, a one-element list, or a list with one name per raster
                band (the entry for ``band_number`` is used). In multi-band
                mode it is the list of column names.
            min_value: Minimum threshold for pixel values (exclusive)
            max_value: Maximum threshold for pixel values (exclusive)

        Returns:
            pd.DataFrame with lon, lat, and band value(s) filtered according to drop_nodata, min_value and max_value
        """
        with self.open_dataset() as src:
            nodata_value = src.nodata if src.nodata is not None else self.nodata

            if band_number is not None:
                # SINGLE BAND MODE
                band = src.read(band_number)

                # Build mask combining nodata and value thresholds
                mask = self._build_data_mask(
                    band, drop_nodata, nodata_value, min_value, max_value
                )

                # Extract coordinates and values with mask
                lons, lats = self._extract_coordinates_with_mask(mask)
                values = np.extract(mask, band) if mask is not None else band.flatten()

                if isinstance(band_names, str):
                    band_name = band_names
                elif isinstance(band_names, list) and band_names:
                    # band_number is 1-indexed while the list is 0-indexed
                    band_name = (
                        band_names[0]
                        if len(band_names) == 1
                        else band_names[band_number - 1]
                    )
                else:
                    band_name = "pixel_value"

                return pd.DataFrame({"lon": lons, "lat": lats, band_name: values})

            # MULTI-BAND MODE (all bands)
            stack = src.read()

            # Auto-detect band names by mode
            if band_names is None:
                if self.mode == "rgb":
                    band_names = ["red", "green", "blue"]
                elif self.mode == "rgba":
                    band_names = ["red", "green", "blue", "alpha"]
                else:
                    # Fall back to a positional name when a band has no description
                    band_names = [
                        src.descriptions[i] or f"band_{i+1}"
                        for i in range(self.count)
                    ]

            # Build mask combining nodata and value thresholds
            mask = self._build_multi_band_mask(
                stack, drop_nodata, nodata_value, min_value, max_value
            )

            # Create DataFrame
            data_dict = self._bands_to_dict(stack, self.count, band_names, mask)
            df = pd.DataFrame(data_dict)

            # RGBA: normalize alpha if needed (values above 1 are taken as 0-255)
            if (
                self.mode == "rgba"
                and "alpha" in df.columns
                and df["alpha"].max() > 1
            ):
                df["alpha"] = df["alpha"] / 255.0

            return df

    def _to_dataframe_chunked(
        self,
        windows: List[rasterio.windows.Window],
        band_number: Optional[int] = None,
        drop_nodata: bool = True,
        band_names: Optional[Union[str, List[str]]] = None,
        show_progress: bool = True,
    ) -> pd.DataFrame:
        """
        Universal chunked converter for ALL modes.

        Reads the raster window by window, converts each window to a
        DataFrame and concatenates the pieces.

        Args:
            windows: Raster windows to read, processed in the given order.
            band_number: Specific band to read (1-indexed). If None, reads all bands.
            drop_nodata: Whether to drop nodata values.
            band_names: Custom band name(s). In single-band mode a string, a
                one-element list, or a list with one name per raster band; in
                multi-band mode the list of column names.
            show_progress: If True, displays a progress bar.

        Returns:
            pd.DataFrame with lon, lat and band value column(s). An empty
            frame with those columns is returned when ``windows`` is empty.
        """
        chunks = []
        iterator = tqdm(windows, desc="Processing chunks") if show_progress else windows
        single_band = band_number is not None

        with self.open_dataset() as src:
            # Same nodata fallback as the non-chunked converter
            nodata_value = src.nodata if src.nodata is not None else self.nodata

            # Resolve column names ONCE (before loop)
            if single_band:
                if isinstance(band_names, str):
                    band_name = band_names
                elif isinstance(band_names, list) and band_names:
                    # band_number is 1-indexed while the list is 0-indexed
                    band_name = (
                        band_names[0]
                        if len(band_names) == 1
                        else band_names[band_number - 1]
                    )
                else:
                    band_name = "pixel_value"
            elif band_names is None:
                if self.mode == "rgb":
                    band_names = ["red", "green", "blue"]
                elif self.mode == "rgba":
                    band_names = ["red", "green", "blue", "alpha"]
                else:  # multi
                    band_names = [
                        src.descriptions[i] or f"band_{i+1}" for i in range(self.count)
                    ]

            for window in iterator:
                lons, lats = self._get_chunk_coordinates(window, src)

                if single_band:
                    # SINGLE BAND
                    band_chunk = src.read(band_number, window=window)
                    mask = self._build_data_mask(band_chunk, drop_nodata, nodata_value)
                    flat_values = band_chunk.flatten()

                    if mask is not None:
                        mask_flat = mask.flatten()
                        lons = lons[mask_flat]
                        lats = lats[mask_flat]
                        flat_values = flat_values[mask_flat]

                    chunk_df = pd.DataFrame(
                        {"lon": lons, "lat": lats, band_name: flat_values}
                    )
                else:
                    # MULTI-BAND (includes RGB/RGBA)
                    stack_chunk = src.read(window=window)
                    mask = self._build_multi_band_mask(
                        stack_chunk, drop_nodata, nodata_value
                    )

                    # Build DataFrame using helper
                    band_dict = {
                        band_names[i]: stack_chunk[i] for i in range(self.count)
                    }
                    chunk_df = self._build_chunk_dataframe(lons, lats, band_dict, mask)

                    # RGBA: normalize alpha (values above 1 are taken as 0-255)
                    if self.mode == "rgba" and "alpha" in chunk_df.columns:
                        if chunk_df["alpha"].max() > 1:
                            chunk_df["alpha"] = chunk_df["alpha"] / 255.0

                chunks.append(chunk_df)

        if not chunks:
            # pd.concat refuses an empty list; return an empty, typed-by-name frame
            value_columns = [band_name] if single_band else list(band_names)
            return pd.DataFrame(columns=["lon", "lat", *value_columns])

        return pd.concat(chunks, ignore_index=True)

    def _prepare_geometry_for_clipping(
        self,
        geometry: Union[
            Polygon,
            MultiPolygon,
            MultiPoint,
            gpd.GeoDataFrame,
            gpd.GeoSeries,
            List[dict],
            dict,
        ],
    ) -> List[dict]:
        """
        Normalise the supported geometry inputs into a list for rasterio.mask.

        Args:
            geometry: A Shapely Polygon/MultiPolygon/MultiPoint, a GeoDataFrame,
                a GeoSeries, a single GeoJSON-like dict, or a list of such dicts.

        Returns:
            List of GeoJSON-like dicts. A list input is returned unchanged.

        Raises:
            TypeError: If ``geometry`` is none of the supported types.
        """
        if isinstance(geometry, MultiPoint):
            # MultiPoint input is replaced by its bounding box
            left, bottom, right, top = geometry.bounds
            envelope = box(left, bottom, right, top)
            return [envelope.__geo_interface__]

        if isinstance(geometry, (Polygon, MultiPolygon)):
            return [geometry.__geo_interface__]

        # GeoDataFrame and GeoSeries share the same per-geometry conversion;
        # only the way the geometries are reached differs.
        shapes = None
        if isinstance(geometry, gpd.GeoDataFrame):
            shapes = geometry.geometry
        elif isinstance(geometry, gpd.GeoSeries):
            shapes = geometry
        if shapes is not None:
            # Missing (None) geometries are skipped
            return [shape.__geo_interface__ for shape in shapes if shape is not None]

        if isinstance(geometry, dict):
            # Single GeoJSON-like dict
            return [geometry]

        if isinstance(geometry, list):
            # Already a list of GeoJSON-like dicts
            return geometry

        raise TypeError(
            f"Unsupported geometry type: {type(geometry)}. "
            "Supported types: Shapely geometries, GeoDataFrame, GeoSeries, "
            "GeoJSON-like dict, or list of GeoJSON-like dicts."
        )

    def _validate_geometry_crs(
        self,
        original_geometry: Any,
    ) -> None:
        """
        Log a warning when the geometry's CRS differs from the raster's CRS.

        Nothing is raised or reprojected here; geometries without CRS
        information (or a raster without one) are silently accepted.

        Args:
            original_geometry: The geometry object as supplied by the caller.
        """
        raster_crs = self.crs

        # GeoPandas containers carry a CRS; other objects are probed for a
        # `crs` attribute and otherwise treated as CRS-less.
        carries_crs = isinstance(
            original_geometry, (gpd.GeoDataFrame, gpd.GeoSeries)
        ) or hasattr(original_geometry, "crs")
        geometry_crs = original_geometry.crs if carries_crs else None

        # Without both CRS values there is nothing to compare
        if geometry_crs is None or raster_crs is None:
            return

        if raster_crs == geometry_crs:
            return

        self.logger.warning(
            f"CRS mismatch detected! Raster CRS: {raster_crs}, "
            f"Geometry CRS: {geometry_crs}. "
            "Consider reprojecting geometry to match raster CRS for accurate clipping."
        )

    def _create_clipped_processor(
        self, clipped_data: np.ndarray, clipped_meta: dict
    ) -> "TifProcessor":
        """
        Helper to create a new TifProcessor instance from clipped data.

        A throw-away 1x1 placeholder TIF is used to construct the new
        processor, the clipped raster is then written into that processor's
        temp directory and the processor is re-pointed at it.

        Args:
            clipped_data: Pixel array to write as the clipped raster.
            clipped_meta: Keyword arguments for ``rasterio.open`` in write mode.

        Returns:
            A new TifProcessor whose dataset is the clipped file.

        Raises:
            RuntimeError: If the clipped file does not exist after writing.
        """
        # Create a temporary placeholder file to initialize the processor
        # This allows us to get the processor's temp_dir
        placeholder_dir = tempfile.mkdtemp()
        placeholder_path = os.path.join(
            placeholder_dir, f"placeholder_{os.urandom(8).hex()}.tif"
        )

        try:
            # Create a minimal valid TIF file as placeholder
            placeholder_transform = rasterio.transform.from_bounds(0, 0, 1, 1, 1, 1)
            with rasterio.open(
                placeholder_path,
                "w",
                driver="GTiff",
                width=1,
                height=1,
                count=1,
                dtype="uint8",
                crs="EPSG:4326",
                transform=placeholder_transform,
            ) as dst:
                dst.write(np.zeros((1, 1, 1), dtype="uint8"))

            # Create a new TifProcessor instance with the placeholder
            # ALWAYS use LocalDataStore() for local temp paths, even if self.data_store is different
            new_processor = TifProcessor(
                dataset_path=placeholder_path,
                data_store=LocalDataStore(),
                mode=self.mode,
            )

            # Now save the clipped file directly to the new processor's temp directory
            clipped_file_path = os.path.join(
                new_processor._temp_dir, f"clipped_{os.urandom(8).hex()}.tif"
            )

            with rasterio.open(clipped_file_path, "w", **clipped_meta) as dst:
                dst.write(clipped_data)

            # Verify file was created successfully
            if not os.path.exists(clipped_file_path):
                raise RuntimeError(
                    f"Failed to create clipped file at {clipped_file_path}"
                )

            # Set the clipped file path and update processor attributes
            new_processor._clipped_file_path = clipped_file_path
            new_processor.dataset_path = clipped_file_path
            new_processor.dataset_paths = [Path(clipped_file_path)]

            # Restore original data_store to the new processor
            new_processor.data_store = self.data_store
        finally:
            # Always remove the placeholder, even when a step above failed,
            # so aborted clips do not leave temp directories behind. Each
            # removal is attempted independently and is best-effort.
            for remove, target in (
                (os.remove, placeholder_path),
                (os.rmdir, placeholder_dir),
            ):
                try:
                    remove(target)
                except OSError:
                    pass

        # Reload metadata since the path changed
        new_processor._load_metadata()

        return new_processor


    def _get_basic_statistics(self, approx_ok: bool = False) -> Dict[str, Any]:
        """
        Compute per-band statistics (min, max, mean, std, sum, count).

        The raster is streamed block by block, accumulating sums and sums of
        squares, so the whole band never has to be held in memory. Pixels
        equal to the nodata value are excluded; a NaN nodata value excludes
        every NaN pixel. Results are cached after the first call.

        Args:
            approx_ok: Whether to allow approximate statistics. Accepted for
                API compatibility; exact statistics are always computed.

        Returns:
            Dictionary with keys ``"bands"`` (list of per-band dicts),
            ``"overall"`` (statistics over all bands) and ``"approximate"``
            (always False). Bands without valid pixels report None for
            min/max/mean/std and zero for sum/count.
        """
        cache_key = "statistics_exact"
        if cache_key in self._cache:
            return self._cache[cache_key]

        if approx_ok:
            self.logger.debug(
                "approx_ok requested for statistics, but only exact statistics are supported."
            )

        band_stats: List[Dict[str, Union[int, float, None]]] = []
        overall = {
            "min": None,
            "max": None,
            "mean": None,
            "std": None,
            "sum": 0.0,
            "count": 0,
        }

        with self.open_dataset() as src:
            nodata_value = src.nodata if src.nodata is not None else self.nodata

            # NaN never compares equal to itself, so `block != nan` would keep
            # every pixel; a NaN nodata value needs an explicit isnan test.
            try:
                nodata_is_nan = nodata_value is not None and bool(
                    np.isnan(nodata_value)
                )
            except TypeError:
                nodata_is_nan = False

            total_sum = 0.0
            total_sq_sum = 0.0
            total_count = 0

            for band_idx in range(1, src.count + 1):
                band_min = None
                band_max = None
                band_sum = 0.0
                band_sq_sum = 0.0
                band_count = 0

                for _, window in src.block_windows(bidx=band_idx):
                    block = src.read(band_idx, window=window, masked=False)

                    if nodata_is_nan:
                        valid = block[~np.isnan(block)]
                    elif nodata_value is not None:
                        valid = block[block != nodata_value]
                    else:
                        valid = block

                    if valid.size == 0:
                        continue

                    # Accumulate in float64 to limit overflow/rounding for
                    # narrow integer and float32 rasters
                    valid = valid.astype(np.float64, copy=False)

                    block_min = float(valid.min())
                    block_max = float(valid.max())

                    band_min = (
                        block_min if band_min is None else min(band_min, block_min)
                    )
                    band_max = (
                        block_max if band_max is None else max(band_max, block_max)
                    )
                    band_sum += float(valid.sum())
                    band_sq_sum += float(np.square(valid, dtype=np.float64).sum())
                    band_count += int(valid.size)

                if band_count == 0:
                    band_stats.append(
                        {
                            "band": band_idx,
                            "min": None,
                            "max": None,
                            "mean": None,
                            "std": None,
                            "sum": 0.0,
                            "count": 0,
                        }
                    )
                    continue

                band_mean = band_sum / band_count
                # Population variance via E[x^2] - E[x]^2; clamp at zero to
                # absorb small negative values from floating-point rounding
                variance = max((band_sq_sum / band_count) - band_mean**2, 0.0)
                band_std = variance**0.5

                band_stats.append(
                    {
                        "band": band_idx,
                        "min": band_min,
                        "max": band_max,
                        "mean": band_mean,
                        "std": band_std,
                        "sum": band_sum,
                        "count": band_count,
                    }
                )

                overall["min"] = (
                    band_min
                    if overall["min"] is None
                    else min(overall["min"], band_min)
                )
                overall["max"] = (
                    band_max
                    if overall["max"] is None
                    else max(overall["max"], band_max)
                )
                total_sum += band_sum
                total_sq_sum += band_sq_sum
                total_count += band_count

            if total_count > 0:
                overall["sum"] = total_sum
                overall["count"] = total_count
                overall["mean"] = total_sum / total_count
                overall_variance = max(
                    (total_sq_sum / total_count) - overall["mean"] ** 2, 0.0
                )
                overall["std"] = overall_variance**0.5

        result = {
            "bands": band_stats,
            "overall": overall,
            "approximate": False,
        }

        self._cache[cache_key] = result
        return result

    def _get_pixel_coordinates(self):
        """
        Return the meshgrid of x and y coordinates for all pixels.

        The grid is built from the cached bounds, raster size and per-axis
        transform values, offset by half a pixel on each side, and is stored
        in the cache under ``"pixel_coords"`` so it is only computed once.
        """
        if "pixel_coords" in self._cache:
            return self._cache["pixel_coords"]

        # Everything needed is already in the metadata cache
        extent = self._cache["bounds"]
        n_cols = self._cache["width"]
        n_rows = self._cache["height"]
        step_x = self._cache["x_transform"]
        step_y = self._cache["y_transform"]

        x_axis = np.linspace(
            extent.left + step_x / 2,
            extent.right - step_x / 2,
            n_cols,
        )
        y_axis = np.linspace(
            extent.top + step_y / 2,
            extent.bottom - step_y / 2,
            n_rows,
        )

        self._cache["pixel_coords"] = np.meshgrid(x_axis, y_axis)
        return self._cache["pixel_coords"]

    def _get_chunk_coordinates(self, window, src):
        """
        Compute the x/y coordinates of every pixel inside a window.

        Args:
            window: Window whose ``height`` and ``width`` define the chunk.
            src: Open dataset providing ``window_transform``.

        Returns:
            Tuple ``(xs, ys)`` of numpy arrays, flattened row by row.
        """
        chunk_transform = src.window_transform(window)

        # indexing="ij" keeps the (row, col) orientation of the window
        row_grid, col_grid = np.meshgrid(
            np.arange(window.height), np.arange(window.width), indexing="ij"
        )
        flat_rows = row_grid.flatten()
        flat_cols = col_grid.flatten()

        x_vals, y_vals = rasterio.transform.xy(chunk_transform, flat_rows, flat_cols)
        return np.array(x_vals), np.array(y_vals)

    def _extract_coordinates_with_mask(self, mask=None):
        """Extract flattened coordinates, optionally applying a mask."""
        x_coords, y_coords = self._get_pixel_coordinates()

        if mask is not None:
            return np.extract(mask, x_coords), np.extract(mask, y_coords)

        return x_coords.flatten(), y_coords.flatten()

    def _build_data_mask(
        self,
        data: np.ndarray,
        drop_nodata: bool = True,
        nodata_value: Optional[float] = None,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
    ) -> Optional[np.ndarray]:
        """
        Build a boolean mask for filtering data based on nodata and value thresholds.

        Args:
            Data array to mask
            drop_no Whether to drop nodata values
            nodata_value: The nodata value to filter
            min_value: Minimum value threshold (exclusive)
            max_value: Maximum value threshold (exclusive)

        Returns:
            Boolean mask or None if no masking needed
        """
        masks = []

        # Nodata mask
        if drop_nodata and nodata_value is not None:
            masks.append(data != nodata_value)

        # Min value threshold
        if min_value is not None:
            masks.append(data > min_value)

        # Max value threshold
        if max_value is not None:
            masks.append(data < max_value)

        if not masks:
            return None

        # Combine all masks with AND logic
        combined_mask = masks[0]
        for mask in masks[1:]:
            combined_mask &= mask

        return combined_mask

    def _build_multi_band_mask(
        self,
        bands: np.ndarray,
        drop_nodata: bool = True,
        nodata_value: Optional[float] = None,
        min_value: Optional[float] = None,
        max_value: Optional[float] = None,
    ) -> Optional[np.ndarray]:
        """
        Build mask for multi-band data.

        Drops pixels where ANY band has nodata or fails value thresholds.

        Args:
            bands: 3D array of shape (nbands, height, width)
            drop_no Whether to drop nodata values
            nodata_value: The nodata value to check
            min_value: Minimum value threshold (exclusive)
            max_value: Maximum value threshold (exclusive)

        Returns:
            Boolean mask or None if no masking needed
        """
        masks = []

        # Nodata mask - any band has nodata
        if drop_nodata and nodata_value is not None:
            has_nodata = np.any(bands == nodata_value, axis=0)
            masks.append(~has_nodata)

        # Value threshold masks - any band fails threshold
        if min_value is not None:
            below_min = np.any(bands <= min_value, axis=0)
            masks.append(~below_min)

        if max_value is not None:
            above_max = np.any(bands >= max_value, axis=0)
            masks.append(~above_max)

        if not masks:
            return None

        # Combine all masks with AND logic
        combined_mask = masks[0]
        for mask in masks[1:]:
            combined_mask &= mask

        return combined_mask

    def _bands_to_dict(self, bands, band_count, band_names, mask=None):
        """Read specified bands and return as a dictionary with optional masking."""

        lons, lats = self._extract_coordinates_with_mask(mask)
        data_dict = {"lon": lons, "lat": lats}

        for idx, name in enumerate(band_names[:band_count]):
            band_data = bands[idx]
            data_dict[name] = (
                np.extract(mask, band_data) if mask is not None else band_data.flatten()
            )

        return data_dict

    def _calculate_optimal_chunk_size(
        self, operation: str = "conversion", target_memory_mb: int = 500
    ) -> int:
        """
        Calculate optimal chunk size based on target memory usage.

        Args:
            operation: Type of operation ('conversion', 'graph').
            target_memory_mb: Target memory per chunk in megabytes.

        Returns:
            Optimal number of rows per chunk.
        """
        bytes_per_element = np.dtype(self.dtype).itemsize
        n_bands = self.count
        width = self.width

        # Adjust for operation type
        if operation == "conversion":
            # DataFrame overhead is roughly 2x
            bytes_per_row = width * n_bands * bytes_per_element * 2
        elif operation == "graph":
            # Graph needs additional space for edges
            bytes_per_row = width * bytes_per_element * 4  # Estimate
        else:
            bytes_per_row = width * n_bands * bytes_per_element

        target_bytes = target_memory_mb * 1024 * 1024
        chunk_rows = max(1, int(target_bytes / bytes_per_row))

        # Ensure chunk size doesn't exceed total height
        chunk_rows = min(chunk_rows, self.height)

        self.logger.info(
            f"Calculated chunk size: {chunk_rows} rows "
            f"(~{self._format_bytes(chunk_rows * bytes_per_row)} per chunk)"
        )

        return chunk_rows

    def _get_chunk_windows(self, chunk_size: int) -> List[rasterio.windows.Window]:
        """
        Split the raster into full-width horizontal strips for chunked reading.

        Args:
            chunk_size: Number of rows per chunk

        Returns:
            List of rasterio.windows.Window objects, top to bottom; the last
            window is shorter when the height is not a multiple of chunk_size.
        """
        total_rows = self.height
        return [
            rasterio.windows.Window(
                col_off=0,
                row_off=first_row,
                width=self.width,
                # Clip the final strip to the raster's bottom edge
                height=min(first_row + chunk_size, total_rows) - first_row,
            )
            for first_row in range(0, total_rows, chunk_size)
        ]

    def _format_bytes(self, bytes_value: int) -> str:
        """Convert bytes to human-readable format."""
        for unit in ["B", "KB", "MB", "GB", "TB"]:
            if bytes_value < 1024.0:
                return f"{bytes_value:.2f} {unit}"
            bytes_value /= 1024.0
        return f"{bytes_value:.2f} PB"

    def _check_available_memory(self) -> dict:
        """
        Take a snapshot of system memory via ``psutil.virtual_memory()``.

        Returns:
            Dict with ``total``, ``available``, ``used`` and ``percent`` as
            reported by psutil, plus ``available_human`` (formatted string).
        """
        import psutil

        snapshot = psutil.virtual_memory()
        report = {
            field: getattr(snapshot, field)
            for field in ("total", "available", "used", "percent")
        }
        report["available_human"] = self._format_bytes(snapshot.available)
        return report

    def _estimate_memory_usage(
        self, operation: str = "conversion", n_workers: int = 1
    ) -> dict:
        """
        Estimate memory usage for various operations.

        Args:
            operation: Type of operation ('conversion', 'batched_sampling', 'merge', 'graph')
            n_workers: Number of workers (for batched_sampling)

        Returns:
            Dict with estimated memory usage in bytes and human-readable format
        """
        bytes_per_element = np.dtype(self.dtype).itemsize
        n_pixels = self.width * self.height
        n_bands = self.count

        estimates = {}

        if operation == "conversion":
            # to_dataframe/to_geodataframe: full raster + DataFrame overhead
            raster_memory = n_pixels * n_bands * bytes_per_element
            # DataFrame overhead (roughly 2x for storage + processing)
            dataframe_memory = (
                n_pixels * n_bands * 16
            )  # 16 bytes per value in DataFrame
            total = raster_memory + dataframe_memory
            estimates["raster"] = raster_memory
            estimates["dataframe"] = dataframe_memory
            estimates["total"] = total

        elif operation == "batched_sampling":
            # Each worker loads full raster into MemoryFile
            # Need to get file size
            if self._merged_file_path:
                file_path = self._merged_file_path
            elif self._reprojected_file_path:
                file_path = self._reprojected_file_path
            else:
                file_path = str(self.dataset_path)

            try:
                import os

                file_size = os.path.getsize(file_path)
            except:
                # Estimate if can't get file size
                file_size = n_pixels * n_bands * bytes_per_element * 1.2  # Add overhead

            estimates["per_worker"] = file_size
            estimates["total"] = file_size * n_workers

        elif operation == "merge":
            # _merge_with_mean uses float64 arrays
            raster_memory = n_pixels * n_bands * 8  # float64
            estimates["sum_array"] = raster_memory
            estimates["count_array"] = n_pixels * 4  # int32
            estimates["total"] = raster_memory + n_pixels * 4

        elif operation == "graph":
            # to_graph: data + node_map + edges
            data_memory = n_pixels * bytes_per_element
            node_map_memory = n_pixels * 4  # int32
            # Estimate edges (rough: 4-connectivity = 4 edges per pixel)
            edges_memory = n_pixels * 4 * 3 * 8  # 3 values per edge, float64
            total = data_memory + node_map_memory + edges_memory
            estimates["data"] = data_memory
            estimates["node_map"] = node_map_memory
            estimates["edges"] = edges_memory
            estimates["total"] = total

        # Add human-readable format
        estimates["human_readable"] = self._format_bytes(estimates["total"])

        return estimates

    def _memory_guard(
        self,
        operation: str,
        threshold_percent: float = 80.0,
        n_workers: Optional[int] = None,
        raise_error: bool = False,
    ) -> bool:
        """
        Check if an operation is safe to perform given memory constraints.

        Args:
            operation: Type of operation to check.
            threshold_percent: Maximum % of available memory to use.
            n_workers: Number of workers (for parallel operations).
            raise_error: If True, raises MemoryError if unsafe.

        Returns:
            True if the operation is deemed safe, False otherwise.

        Raises:
            MemoryError: If raise_error is True and memory is insufficient.
        """
        import warnings

        estimates = self._estimate_memory_usage(operation, n_workers=n_workers or 1)
        memory_info = self._check_available_memory()

        estimated_usage = estimates["total"]
        available = memory_info["available"]
        threshold = available * (threshold_percent / 100.0)

        is_safe = estimated_usage <= threshold

        if not is_safe:
            usage_str = self._format_bytes(estimated_usage)
            available_str = memory_info["available_human"]

            message = (
                f"Memory warning: {operation} operation may require {usage_str} "
                f"but only {available_str} is available. "
                f"Current memory usage: {memory_info['percent']:.1f}%"
            )

            if raise_error:
                raise MemoryError(message)
            else:
                warnings.warn(message, ResourceWarning)
                if hasattr(self, "logger"):
                    self.logger.warning(message)

        return is_safe

    def _validate_mode_band_compatibility(self):
        """Validate that mode matches band count."""
        mode_requirements = {
            "single": (1, "1-band"),
            "rgb": (3, "3-band"),
            "rgba": (4, "4-band"),
        }

        if self.mode in mode_requirements:
            required_count, description = mode_requirements[self.mode]
            if self.count != required_count:
                raise ValueError(
                    f"{self.mode.upper()} mode requires a {description} TIF file"
                )
        elif self.mode == "multi" and self.count < 2:
            raise ValueError("Multi mode requires a TIF file with 2 or more bands")

    def save_to_file(
        self,
        output_path: Union[str, Path],
        compress: Optional[str] = "LZW",
        tiled: bool = True,
        blocksize: int = 512,
        bigtiff: Optional[str] = None,
        predictor: Optional[int] = None,
        num_threads: Optional[int] = None,
        cog: bool = False,
        overviews: Optional[List[int]] = None,
        overview_resampling: str = "nearest",
        **kwargs,
    ) -> Path:
        """
        Write the raster to ``output_path`` through the data store.

        The raster is first written with rasterio to a local temporary file;
        its bytes are then passed to ``self.data_store.write_file`` and the
        temporary file is removed.

        Args:
            output_path: Output file path.
            compress: Compression method (e.g., 'LZW', 'ZSTD'); None or 'NONE'
                disables compression.
            tiled: If True, tiles the output using ``blocksize``.
            blocksize: Block size (both x and y) for tiled output.
            bigtiff: 'YES', 'NO', or 'IF_NEEDED' for large files.
            predictor: Compression predictor (2 for int, 3 for float).
            num_threads: Number of threads for compression.
            cog: If True, overviews are built and a ``LAYOUT=COG`` tag is written.
            overviews: Overview levels; defaults to [2, 4, 8, 16] when ``cog``
                is set and no levels are given.
            overview_resampling: Name of the ``Resampling`` member to use.
            **kwargs: Additional creation options for rasterio; these override
                the options derived from the named arguments.

        Returns:
            Path to the saved TIF file.
        """
        output_path = Path(output_path)
        codec = compress.upper() if compress else None

        # Creation options derived from the named arguments
        creation_options = {}
        if codec and codec != "NONE":
            creation_options["compress"] = codec
        if tiled:
            creation_options.update(
                tiled=True, blockxsize=blocksize, blockysize=blocksize
            )
        if bigtiff:
            creation_options["BIGTIFF"] = bigtiff
        if predictor is not None:
            creation_options["predictor"] = predictor
        if num_threads is not None:
            creation_options["NUM_THREADS"] = num_threads

        # Default level/quality per codec, applied only if the caller gave none
        codec_defaults = {
            "DEFLATE": ("ZLEVEL", 6),
            "ZSTD": ("ZSTD_LEVEL", 9),
            "JPEG": ("JPEG_QUALITY", 85),
            "WEBP": ("WEBP_LEVEL", 75),
        }
        if codec in codec_defaults:
            option_name, option_value = codec_defaults[codec]
            kwargs.setdefault(option_name, option_value)

        # Explicit kwargs win over everything derived above
        creation_options.update(kwargs)

        # rasterio needs a local path; only the name is kept, the handle is closed
        with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp:
            tmp_path = tmp.name

        try:
            # open_dataset handles merged/reprojected/clipped files automatically
            with self.open_dataset() as src:
                profile = src.profile.copy()
                profile.update(**creation_options)

                with rasterio.open(tmp_path, "w", **profile) as dst:
                    # Copy band by band (rasterio band indices are 1-based)
                    for band_idx in range(1, src.count + 1):
                        dst.write(src.read(band_idx), band_idx)

                    if overviews or cog:
                        levels = [2, 4, 8, 16] if overviews is None else overviews
                        dst.build_overviews(
                            levels, getattr(Resampling, overview_resampling)
                        )

                    if cog:
                        dst.update_tags(LAYOUT="COG")

            # Hand the finished file to the data store as raw bytes
            file_content = Path(tmp_path).read_bytes()
            self.data_store.write_file(str(output_path), file_content)

            self.logger.info(f"Raster saved to {output_path}")

        finally:
            # Best-effort removal of the temporary file
            try:
                os.remove(tmp_path)
            except OSError:
                pass

        return output_path

    def save_array_to_file(
        self,
        array: np.ndarray,
        output_path: Union[str, Path],
        compress: Optional[str] = "LZW",
        tiled: bool = True,
        blocksize: int = 512,
        crs: Optional[Any] = None,
        transform: Optional[Any] = None,
        nodata: Optional[float] = None,
        **kwargs,
    ) -> Path:
        """
        Write a numpy array as a GeoTIFF, borrowing georeferencing from this raster.

        The array is written to a local temporary file with rasterio and the
        resulting bytes are passed to ``self.data_store.write_file``.

        Args:
            array: 2D (height, width) or 3D (bands, height, width) data.
            output_path: Destination file path.
            compress: Compression method; None or 'NONE' disables it.
            tiled: If True, tiles the output using ``blocksize``.
            blocksize: Block size (both x and y) for tiled output.
            crs: Optional CRS override; defaults to the source CRS.
            transform: Optional Affine transform override; defaults to the source's.
            nodata: Optional nodata value override; defaults to the source's.
            **kwargs: Additional creation options, applied last.

        Returns:
            Path to the saved TIF file.

        Raises:
            ValueError: If ``array`` is neither 2D nor 3D.
        """
        output_path = Path(output_path)

        if array.ndim not in (2, 3):
            raise ValueError(f"Array must be 2D or 3D, got shape {array.shape}")
        if array.ndim == 2:
            # Promote a single band to (1, height, width)
            array = array[np.newaxis, :, :]

        num_bands, height, width = array.shape

        # Fill in any georeferencing the caller did not supply from the source
        with self.open_dataset() as src:
            crs = src.crs if crs is None else crs
            transform = src.transform if transform is None else transform
            nodata = src.nodata if nodata is None else nodata
            dtype = array.dtype

        profile = dict(
            driver="GTiff",
            height=height,
            width=width,
            count=num_bands,
            dtype=dtype,
            crs=crs,
            transform=transform,
            nodata=nodata,
        )

        if compress and compress.upper() != "NONE":
            profile["compress"] = compress.upper()
        if tiled:
            profile.update(tiled=True, blockxsize=blocksize, blockysize=blocksize)

        profile.update(kwargs)

        # rasterio needs a local path; only the name is kept, the handle is closed
        with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp:
            tmp_path = tmp.name

        try:
            with rasterio.open(tmp_path, "w", **profile) as dst:
                # rasterio band indices are 1-based
                for band_number, band_values in enumerate(array, start=1):
                    dst.write(band_values, band_number)

            file_content = Path(tmp_path).read_bytes()
            self.data_store.write_file(str(output_path), file_content)

            self.logger.info(f"Array saved to {output_path}")

        finally:
            # Best-effort removal of the temporary file
            try:
                os.remove(tmp_path)
            except OSError:
                pass

        return output_path

    def __enter__(self):
        """Enter the context manager; returns this instance unchanged."""
        return self

    def __del__(self):
        """Clean up temporary files and directories."""
        if (
            hasattr(self, "_temp_dir")
            and self._temp_dir
            and os.path.exists(self._temp_dir)
        ):
            shutil.rmtree(self._temp_dir, ignore_errors=True)

    def cleanup(self):
        """
        Explicitly remove the temporary directory used by this processor.

        Does nothing when no temporary directory is set (attribute missing,
        ``None`` or empty) or when it no longer exists, so it is safe to call
        more than once. Errors raised while deleting are propagated.
        """
        temp_dir = getattr(self, "_temp_dir", None)
        # os.path.exists(None) raises TypeError, so rule out falsy values first
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
            self.logger.info("Cleaned up temporary files")

    def __exit__(self, *args):
        """
        Exit the context manager and clean up temporary files.

        Delegates to ``cleanup()``. The exception details passed in ``*args``
        are ignored and nothing is returned, so an exception raised inside the
        ``with`` block is not suppressed.
        """
        self.cleanup()
bounds property

Get the bounds of the TIF file

count: int property

Get the band count from the TIF file

crs property

Get the coordinate reference system from the TIF file

dtype property

Get the data types from the TIF file

is_merged: bool property

Check if this processor was created from multiple rasters.

nodata: int property

Get the value representing no data in the rasters

resolution: Tuple[float, float] property

Get the x and y resolution (pixel width and height or pixel size) from the TIF file

source_count: int property

Get the number of source rasters.

transform property

Get the transform from the TIF file

x_transform: float property

Get the x transform from the TIF file

y_transform: float property

Get the y transform from the TIF file

__del__()

Clean up temporary files and directories.

Source code in gigaspatial/processing/tif_processor.py
def __del__(self):
    """Remove the temporary directory, if one exists, when the object is destroyed."""
    temp_dir = getattr(self, "_temp_dir", None)
    if not temp_dir:
        # Attribute never set, or already cleared: nothing to remove
        return
    if os.path.exists(temp_dir):
        # ignore_errors: a finalizer should not raise
        shutil.rmtree(temp_dir, ignore_errors=True)
__exit__(*args)

Proper context manager exit with cleanup.

Source code in gigaspatial/processing/tif_processor.py
def __exit__(self, *args):
    """
    Exit the context manager and clean up temporary files.

    Delegates to ``cleanup()``. The exception details passed in ``*args``
    are ignored and nothing is returned, so an exception raised inside the
    ``with`` block is not suppressed.
    """
    self.cleanup()
__post_init__()

Validate inputs, merge rasters if needed, and set up logging.

Source code in gigaspatial/processing/tif_processor.py
def __post_init__(self):
    """
    Validate inputs, merge rasters if needed, and set up logging.

    Sets up the data store (defaulting to ``LocalDataStore``), logger, cache,
    a fresh temporary directory and the merged/reprojected/clipped path
    slots. A list of dataset paths is validated and merged; a single path is
    checked for existence and, when ``target_crs`` is set, reprojected to a
    temporary file. Finally metadata is loaded and the mode is validated
    against the band count.

    Raises:
        FileNotFoundError: If a single dataset path does not exist.
    """
    self.data_store = self.data_store or LocalDataStore()
    self.logger = config.get_logger(self.__class__.__name__)
    self._cache = {}
    # A temp directory is created for every instance; __del__/cleanup remove it
    self._temp_dir = tempfile.mkdtemp()
    self._merged_file_path = None
    self._reprojected_file_path = None
    self._clipped_file_path = None

    # Handle multiple dataset paths
    if isinstance(self.dataset_path, list):
        self.dataset_paths = [Path(p) for p in self.dataset_path]
        self._validate_multiple_datasets()
        self._merge_rasters()
        # From here on the processor operates on the merged file
        # (presumably _merge_rasters sets _merged_file_path -- defined elsewhere)
        self.dataset_path = self._merged_file_path
    else:
        self.dataset_paths = [Path(self.dataset_path)]
        # For absolute paths with LocalDataStore, check file existence directly
        # to avoid path resolution issues
        if isinstance(self.data_store, LocalDataStore) and os.path.isabs(
            str(self.dataset_path)
        ):
            if not os.path.exists(str(self.dataset_path)):
                raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")
        elif not self.data_store.file_exists(str(self.dataset_path)):
            raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")

        # Reproject single raster during initialization if target_crs is set
        if self.target_crs:
            self.logger.info(f"Reprojecting single raster to {self.target_crs}...")
            # The whole file is read into memory and opened via MemoryFile
            with self.data_store.open(str(self.dataset_path), "rb") as f:
                with rasterio.MemoryFile(f.read()) as memfile:
                    with memfile.open() as src:
                        self._reprojected_file_path = self._reproject_to_temp_file(
                            src, self.target_crs
                        )
            # From here on the processor operates on the reprojected file
            self.dataset_path = self._reprojected_file_path

    self._load_metadata()
    self._validate_mode_band_compatibility()
cleanup()

Explicit cleanup method for better control.

Source code in gigaspatial/processing/tif_processor.py
def cleanup(self):
    """
    Explicitly remove the temporary directory used by this processor.

    Does nothing when no temporary directory is set (attribute missing,
    ``None`` or empty) or when it no longer exists, so it is safe to call
    more than once. Errors raised while deleting are propagated.
    """
    temp_dir = getattr(self, "_temp_dir", None)
    # os.path.exists(None) raises TypeError, so rule out falsy values first
    if temp_dir and os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
        self.logger.info("Cleaned up temporary files")
clip_to_bounds(bounds, bounds_crs=None, return_clipped_processor=True)

Clip the raster to a rectangular bounding box.

Parameters:

Name Type Description Default
bounds tuple

Bounding box as (minx, miny, maxx, maxy).

required
bounds_crs Optional[str]

The CRS of the input bounds. Defaults to raster CRS.

None
return_clipped_processor bool

If True, returns a new TifProcessor instance.

True

Returns:

Type Description
Union[TifProcessor, tuple]

The clipped TifProcessor or tuple of data/metadata.

Source code in gigaspatial/processing/tif_processor.py
def clip_to_bounds(
    self,
    bounds: tuple,
    bounds_crs: Optional[str] = None,
    return_clipped_processor: bool = True,
) -> Union["TifProcessor", tuple]:
    """
    Clip the raster to a rectangular bounding box.

    The box is turned into a geometry and, when ``bounds_crs`` is given and
    differs from the raster CRS, reprojected to the raster CRS before being
    passed to ``clip_to_geometry`` with ``crop=True``.

    Args:
        bounds: Bounding box as (minx, miny, maxx, maxy).
        bounds_crs: The CRS of the input bounds. Defaults to raster CRS.
        return_clipped_processor: If True, returns a new TifProcessor instance.

    Returns:
        The clipped TifProcessor or tuple of data/metadata.
    """
    clip_geom = box(*bounds)

    if bounds_crs is not None:
        raster_crs = self.crs
        crs_matches = self.crs == bounds_crs
        if not crs_matches:
            # Route the box through a one-row GeoDataFrame to reproject it
            reprojected = gpd.GeoDataFrame(
                [1], geometry=[clip_geom], crs=bounds_crs
            ).to_crs(raster_crs)
            clip_geom = reprojected.geometry.iloc[0]

    return self.clip_to_geometry(
        geometry=clip_geom,
        crop=True,
        return_clipped_processor=return_clipped_processor,
    )
clip_to_geometry(geometry, crop=True, all_touched=True, invert=False, nodata=None, pad=False, pad_width=0.5, return_clipped_processor=True)

Clip the raster to the boundaries of specific geometries.

Parameters:

Name Type Description Default
geometry Union[Polygon, MultiPolygon, GeoDataFrame, GeoSeries, List[dict], dict]

The geometry to clip to (Polygon, GDF, GeoSeries, etc.).

required
crop bool

If True, the raster's extent is reduced to the geometry's bounding box.

True
all_touched bool

If True, includes all pixels touched by the geometry.

True
invert bool

If True, masks pixels inside the geometry.

False
nodata Optional[Union[int, float]]

Override for the nodata value in the output.

None
pad bool

Whether to pad the geometry before clipping.

False
pad_width float

Width of the padding in pixels.

0.5
return_clipped_processor bool

If True, returns a new TifProcessor instance.

True

Returns:

Type Description
Union[TifProcessor, tuple]

A new TifProcessor instance (if return_clipped_processor is True) or

Union[TifProcessor, tuple]

a tuple of (clipped_array, transform, metadata).

Raises:

Type Description
ValueError

If the geometry does not overlap with the raster or CRS is incompatible.

Source code in gigaspatial/processing/tif_processor.py
def clip_to_geometry(
    self,
    geometry: Union[
        Polygon, MultiPolygon, gpd.GeoDataFrame, gpd.GeoSeries, List[dict], dict
    ],
    crop: bool = True,
    all_touched: bool = True,
    invert: bool = False,
    nodata: Optional[Union[int, float]] = None,
    pad: bool = False,
    pad_width: float = 0.5,
    return_clipped_processor: bool = True,
) -> Union["TifProcessor", tuple]:
    """
    Clip the raster to the boundaries of specific geometries.

    Args:
        geometry: The geometry to clip to (Polygon, GDF, GeoSeries, etc.).
        crop: If True, the raster's extent is reduced to the geometry's bounding box.
        all_touched: If True, includes all pixels touched by the geometry.
        invert: If True, masks pixels *inside* the geometry.
        nodata: Override for the nodata value in the output; falls back to
            the source nodata in the returned metadata.
        pad: Whether to pad the geometry before clipping.
        pad_width: Width of the padding in pixels.
        return_clipped_processor: If True, returns a new TifProcessor instance.

    Returns:
        A new TifProcessor instance (if return_clipped_processor is True) or
        a tuple of (clipped_array, transform, metadata).

    Raises:
        ValueError: If the geometry does not overlap with the raster or CRS is incompatible.
    """
    # Normalise the input into shapes first, then check its CRS
    shapes = self._prepare_geometry_for_clipping(geometry)
    self._validate_geometry_crs(geometry)

    with self.open_dataset() as src:
        try:
            clipped_data, clipped_transform = mask(
                dataset=src,
                shapes=shapes,
                crop=crop,
                all_touched=all_touched,
                invert=invert,
                nodata=nodata,
                pad=pad,
                pad_width=pad_width,
                filled=True,
            )

            # Source metadata with size, transform and nodata of the clip
            clipped_meta = src.meta.copy()
            clipped_meta["height"] = clipped_data.shape[1]
            clipped_meta["width"] = clipped_data.shape[2]
            clipped_meta["transform"] = clipped_transform
            clipped_meta["nodata"] = src.nodata if nodata is None else nodata

        except ValueError as err:
            if "Input shapes do not overlap raster" not in str(err):
                raise
            # Replace the low-level message with a hint about the likely cause
            raise ValueError(
                "The geometry does not overlap with the raster. "
                "Check that both are in the same coordinate reference system."
            ) from err

    if not return_clipped_processor:
        return clipped_data, clipped_transform, clipped_meta

    # Wrap the clipped data in a new TifProcessor
    return self._create_clipped_processor(clipped_data, clipped_meta)
get_raster_info(include_statistics=False, approx_ok=False)

Get comprehensive metadata and statistics for the raster.

Parameters:

Name Type Description Default
include_statistics bool

Whether to compute pixel statistics (mean, std, etc.).

False
approx_ok bool

Whether to allow approximate statistics for speed.

False

Returns:

Type Description
Dict[str, Any]

Dictionary containing metadata like dimensions, CRS, bounds, and optionally statistics.

Source code in gigaspatial/processing/tif_processor.py
def get_raster_info(
    self,
    include_statistics: bool = False,
    approx_ok: bool = False,
) -> Dict[str, Any]:
    """
    Summarize the raster's metadata, optionally with pixel statistics.

    Args:
        include_statistics: When True, a "statistics" entry is added, taken
            from `self._get_basic_statistics`.
        approx_ok: Forwarded to `_get_basic_statistics`; only consulted when
            `include_statistics` is True.

    Returns:
        Dictionary with the keys "count", "width", "height", "crs", "bounds",
        "transform", "dtypes", "nodata", "mode", "is_merged" and
        "source_count", plus "statistics" when requested.
    """
    # Note: the "dtypes" key is populated from the `dtype` attribute.
    summary = dict(
        count=self.count,
        width=self.width,
        height=self.height,
        crs=self.crs,
        bounds=self.bounds,
        transform=self.transform,
        dtypes=self.dtype,
        nodata=self.nodata,
        mode=self.mode,
        is_merged=self.is_merged,
        source_count=self.source_count,
    )

    if not include_statistics:
        return summary

    summary["statistics"] = self._get_basic_statistics(approx_ok=approx_ok)
    return summary
open_dataset()

Context manager for robustly accessing the TIF dataset.

Automatically handles access to original, merged, reprojected, or clipped files across different data stores.

Yields:

Type Description

A rasterio.DatasetReader object.

Source code in gigaspatial/processing/tif_processor.py
@contextmanager
def open_dataset(self):
    """
    Context manager that opens whichever file currently backs this raster.

    Resolution order: the merged file, then the reprojected file, then the
    clipped file. If none of those is set, the original dataset path is
    opened directly when the data store is a `LocalDataStore`; otherwise the
    file's bytes are read through the data store and opened from a
    `rasterio.MemoryFile`.

    Yields:
        The dataset object returned by rasterio for the selected source.
    """
    # First derived artefact that is set wins (merged > reprojected > clipped).
    derived_path = (
        self._merged_file_path
        or self._reprojected_file_path
        or self._clipped_file_path
    )

    if derived_path:
        openable = derived_path
    elif isinstance(self.data_store, LocalDataStore):
        openable = str(self.dataset_path)
    else:
        openable = None

    if openable is not None:
        with rasterio.open(openable) as src:
            yield src
        return

    # Non-local store: pull the whole file into memory and open it from there.
    with self.data_store.open(str(self.dataset_path), "rb") as handle:
        with rasterio.MemoryFile(handle.read()) as memfile:
            with memfile.open() as src:
                yield src
reproject_to(target_crs, output_path=None, resampling_method=None, resolution=None)

Reproject the current raster to a new CRS.

Parameters:

Name Type Description Default
target_crs str

The destination CRS (e.g., "EPSG:4326").

required
output_path Optional[Union[str, Path]]

Optional path to save the result. If None, saves to temp.

None
resampling_method Optional[Resampling]

Optional override for resampling.

None
resolution Optional[Tuple[float, float]]

Optional target pixel resolution (x, y).

None

Returns:

Type Description

Path to the reprojected file.

Source code in gigaspatial/processing/tif_processor.py
def reproject_to(
    self,
    target_crs: str,
    output_path: Optional[Union[str, Path]] = None,
    resampling_method: Optional[Resampling] = None,
    resolution: Optional[Tuple[float, float]] = None,
):
    """
    Write a copy of the current raster reprojected into `target_crs`.

    Args:
        target_crs: Destination CRS (e.g., "EPSG:4326"). It is compared
            against the source CRS string to detect a no-op.
        output_path: Where to write the result. When omitted, a randomly
            named file inside `self._temp_dir` is used.
        resampling_method: Resampling override; falls back to
            `self.resampling_method`.
        resolution: Target pixel resolution override; falls back to
            `self.reprojection_resolution`.

    Returns:
        `Path` of the reprojected file. If the raster is already in
        `target_crs`, nothing is reprojected and `self.dataset_path` is
        returned (after copying it to `output_path` if one was given).
    """
    self.logger.info(f"Reprojecting raster to {target_crs}...")

    # Explicit arguments win; otherwise use the processor-level defaults
    chosen_resampling = resampling_method or self.resampling_method
    chosen_resolution = resolution or self.reprojection_resolution

    with self.open_dataset() as src:
        already_in_target = src.crs.to_string() == target_crs
        if already_in_target:
            self.logger.info(
                "Raster is already in the target CRS. No reprojection needed."
            )
            # Honour an explicit output path by copying the source as-is
            if output_path:
                self.data_store.copy_file(str(self.dataset_path), output_path)
            return self.dataset_path

        if output_path:
            dst_path = output_path
        else:
            temp_dir = self._temp_dir
            dst_path = os.path.join(
                temp_dir, f"reprojected_single_{os.urandom(8).hex()}.tif"
            )

        dst_profile = self._get_reprojection_profile(
            src, target_crs, chosen_resolution
        )

        with rasterio.open(dst_path, "w", **dst_profile) as dst:
            thread_count = multiprocessing.cpu_count()
            # Band indexes are 1-based
            for band_idx in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, band_idx),
                    destination=rasterio.band(dst, band_idx),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=dst.transform,
                    dst_crs=dst.crs,
                    resampling=chosen_resampling,
                    num_threads=thread_count,
                )

        self.logger.info(f"Reprojection complete. Output saved to {dst_path}")
        return Path(dst_path)
sample_by_coordinates(coordinate_list, **kwargs)

Extract raster values at specific point coordinates.

Parameters:

Name Type Description Default
coordinate_list List[Tuple[float, float]]

List of (longitude, latitude) tuples.

required
**kwargs

Additional arguments forwarded to the dataset's sample() call; they are only used for multi-band rasters outside RGB/RGBA mode.

{}

Returns:

Type Description
Union[ndarray, dict]

Numpy array of values (single-band) or dict of band values (RGB/RGBA).

Source code in gigaspatial/processing/tif_processor.py
def sample_by_coordinates(
    self, coordinate_list: List[Tuple[float, float]], **kwargs
) -> Union[np.ndarray, dict]:
    """
    Read raster values at the given point coordinates.

    Args:
        coordinate_list: List of (longitude, latitude) tuples.
        **kwargs: Forwarded to the dataset's `sample` call, but only for
            multi-band rasters that are not in "rgb"/"rgba" mode; the other
            paths ignore them.

    Returns:
        - "rgba" / "rgb" mode: dict mapping colour name to a list with one
          value per coordinate.
        - other rasters with more than one band: array built from the full
          per-coordinate sample.
        - single band: array holding the first sampled value per coordinate.

    Raises:
        ValueError: If the mode is "rgba"/"rgb" but `self.count` is not
            4/3 respectively.
    """
    self.logger.info("Sampling raster values at the coordinates...")

    def _sample_named_bands(src, colour_names):
        # One pass over the coordinates per band; band indexes are 1-based.
        return {
            colour: [
                vals[0] for vals in src.sample(coordinate_list, indexes=band_idx)
            ]
            for band_idx, colour in enumerate(colour_names, 1)
        }

    with self.open_dataset() as src:
        mode = self.mode

        if mode == "rgba":
            if self.count != 4:
                raise ValueError("RGBA mode requires a 4-band TIF file")
            return _sample_named_bands(src, ("red", "green", "blue", "alpha"))

        if mode == "rgb":
            if self.count != 3:
                raise ValueError("RGB mode requires a 3-band TIF file")
            return _sample_named_bands(src, ("red", "green", "blue"))

        if self.count > 1:
            return np.array(list(src.sample(coordinate_list, **kwargs)))

        return np.array([vals[0] for vals in src.sample(coordinate_list)])
sample_by_polygons(polygon_list, stat='mean')

Sample raster values within polygons and compute aggregate statistics.

Parameters:

Name Type Description Default
polygon_list

List of Shapely Polygon or MultiPolygon objects.

required
stat Union[str, Callable, List[Union[str, Callable]]]

Statistic(s) to compute. Can be a string (e.g., 'mean'), a callable, or a list of both.

'mean'

Returns:

Type Description

Numpy array of results (if single stat) or a list of dictionaries (if multi-stat).

Source code in gigaspatial/processing/tif_processor.py
def sample_by_polygons(
    self,
    polygon_list,
    stat: Union[str, Callable, List[Union[str, Callable]]] = "mean",
):
    """
    Sample raster values within polygons and compute aggregate statistics.

    Args:
        polygon_list: List of Shapely Polygon or MultiPolygon objects.
        stat: Statistic(s) to compute. A string is resolved as "count"
              (number of valid pixels) or as the NumPy function of that name
              (e.g., 'mean' -> `np.mean`); a callable is used as-is; a list
              mixes both.

    Returns:
        Numpy array of results (if single stat) or a list of dictionaries
        keyed by statistic name (if `stat` is a list). A polygon with no valid
        pixels, or whose processing raised, yields NaN for its statistic(s).
    """
    # A non-list `stat` selects the flat (array) result shape
    single_stat = not isinstance(stat, list)
    stats_list = [stat] if single_stat else stat

    # Resolve every requested statistic to a (callable, name) pair
    stat_funcs = []
    stat_names = []

    for s in stats_list:
        if callable(s):
            stat_funcs.append(s)
            stat_names.append(getattr(s, "__name__", f"custom_{len(stat_names)}"))
        else:
            # "count" is the only string not looked up on numpy
            stat_funcs.append(len if s == "count" else getattr(np, s))
            stat_names.append(s)

    def _nan_result():
        """Placeholder for a polygon that produced no usable values."""
        if single_stat:
            return np.nan
        return {name: np.nan for name in stat_names}

    results = []

    with self.open_dataset() as src:
        for polygon_idx, polygon in enumerate(tqdm(polygon_list)):
            try:
                out_image, _ = mask(src, [polygon], crop=True, filled=False)

                # Use masked arrays for more efficient nodata handling
                if hasattr(out_image, "mask"):
                    valid_data = out_image.compressed()
                elif self.nodata is not None:
                    # `is not None` so that a nodata value of 0 is still
                    # filtered out (a truthiness test would skip it)
                    valid_data = out_image[out_image != self.nodata]
                else:
                    valid_data = out_image.flatten()

                if len(valid_data) == 0:
                    results.append(_nan_result())
                elif single_stat:
                    results.append(stat_funcs[0](valid_data))
                else:
                    # Compute all statistics for this polygon; one failing
                    # statistic must not discard the others
                    polygon_stats = {}
                    for func, name in zip(stat_funcs, stat_names):
                        try:
                            polygon_stats[name] = func(valid_data)
                        except Exception:
                            polygon_stats[name] = np.nan
                    results.append(polygon_stats)

            except Exception as exc:
                # Best effort: keep going, but leave a trace of the failure
                # instead of swallowing it silently
                self.logger.debug(
                    f"Sampling failed for polygon {polygon_idx}: {exc}"
                )
                results.append(_nan_result())

    return np.array(results) if single_stat else results
sample_by_polygons_batched(polygon_list, stat='mean', batch_size=100, n_workers=4, show_progress=True, check_memory=True, **kwargs)

Sample raster values by polygons in parallel using batch processing.

Efficiently distributes sampling tasks across multiple worker processes.

Parameters:

Name Type Description Default
polygon_list List[Union[Polygon, MultiPolygon]]

List of Shapely Polygon or MultiPolygon objects.

required
stat Union[str, Callable]

Statistic to compute for each polygon.

'mean'
batch_size int

Number of polygons to process in each worker batch.

100
n_workers int

Number of parallel processes to use.

4
show_progress bool

If True, displays a progress bar.

True
check_memory bool

If True, validates memory availability before starting.

True
**kwargs

Additional arguments.

{}

Returns:

Type Description
ndarray

Numpy array of statistics for each polygon.

Source code in gigaspatial/processing/tif_processor.py
def sample_by_polygons_batched(
    self,
    polygon_list: List[Union[Polygon, MultiPolygon]],
    stat: Union[str, Callable] = "mean",
    batch_size: int = 100,
    n_workers: int = 4,
    show_progress: bool = True,
    check_memory: bool = True,
    **kwargs,
) -> np.ndarray:
    """
    Sample raster values by polygons in parallel using batch processing.

    The polygons are split into chunks of `batch_size`, each chunk is handed
    to `self._process_polygon_batch` in a `multiprocessing.Pool`, and the
    per-chunk results are flattened back in input order.

    Args:
        polygon_list: List of Shapely Polygon or MultiPolygon objects.
        stat: Statistic to compute for each polygon; a callable, or the name
            of a NumPy function (e.g., 'mean').
        batch_size: Number of polygons to process in each worker batch.
        n_workers: Number of parallel processes to use.
        show_progress: If True, displays a progress bar.
        check_memory: If True, runs the memory guard first and emits a
            ResourceWarning with a suggested worker count when it reports
            the operation as unsafe.
        **kwargs: Additional arguments (currently unused by this method).

    Returns:
        Numpy array of statistics for each polygon (empty for empty input).
    """
    # Imported up front: an `import` further down would make `warnings` a
    # function-local name that is unbound when the memory branch runs first.
    import sys
    import warnings

    # Memory guard check with n_workers consideration
    if check_memory:
        is_safe = self._memory_guard(
            "batched_sampling",
            threshold_percent=85.0,
            n_workers=n_workers,
            raise_error=False,
        )

        if not is_safe:
            # Suggest reducing n_workers
            memory_info = self._check_available_memory()
            estimates = self._estimate_memory_usage("batched_sampling", n_workers=1)

            # Number of workers that fits into 70% of the available memory
            suggested_workers = max(
                1, int(memory_info["available"] * 0.7 / estimates["per_worker"])
            )

            warnings.warn(
                f"Consider reducing n_workers from {n_workers} to {suggested_workers} "
                f"to reduce memory pressure.",
                ResourceWarning,
            )

    # Platform check: warn on Windows/macOS unless the "fork" start method
    # is in effect
    if sys.platform in ("win32", "darwin"):
        if multiprocessing.get_start_method(allow_none=True) != "fork":
            warnings.warn(
                "Batched sampling may not work on Windows/macOS. "
                "Use sample_by_polygons() if you encounter errors.",
                RuntimeWarning,
            )

    def _chunk_list(data_list, chunk_size):
        """Yield successive chunks from data_list."""
        for i in range(0, len(data_list), chunk_size):
            yield data_list[i : i + chunk_size]

    if len(polygon_list) == 0:
        return np.array([])

    stat_func = stat if callable(stat) else getattr(np, stat)
    polygon_chunks = list(_chunk_list(polygon_list, batch_size))

    with multiprocessing.Pool(
        initializer=self._initializer_worker, processes=n_workers
    ) as pool:
        process_func = partial(self._process_polygon_batch, stat_func=stat_func)
        # imap preserves chunk order, so results line up with polygon_list
        chunk_results = pool.imap(process_func, polygon_chunks)
        if show_progress:
            chunk_results = tqdm(
                chunk_results,
                total=len(polygon_chunks),
                desc="Sampling polygons",
            )
        batched_results = list(chunk_results)

        results = [item for sublist in batched_results for item in sublist]

    return np.array(results)
save_array_to_file(array, output_path, compress='LZW', tiled=True, blocksize=512, crs=None, transform=None, nodata=None, **kwargs)

Save a numpy array to a raster file using metadata from this processor.

Parameters:

Name Type Description Default
array ndarray

2D or 3D array of data to save.

required
output_path Union[str, Path]

Destination file path.

required
compress Optional[str]

Compression method.

'LZW'
tiled bool

If True, tiles the output.

True
blocksize int

Block size for tiled output.

512
crs Optional[Any]

Optional CRS override.

None
transform Optional[Any]

Optional Affine transform override.

None
nodata Optional[float]

Optional nodata value override.

None
**kwargs

Additional creation options.

{}

Returns:

Type Description
Path

Path to the saved TIF file.

Source code in gigaspatial/processing/tif_processor.py
def save_array_to_file(
    self,
    array: np.ndarray,
    output_path: Union[str, Path],
    compress: Optional[str] = "LZW",
    tiled: bool = True,
    blocksize: int = 512,
    crs: Optional[Any] = None,
    transform: Optional[Any] = None,
    nodata: Optional[float] = None,
    **kwargs,
) -> Path:
    """
    Write a numpy array as a GeoTIFF, borrowing georeferencing from this raster.

    The file is first written to a local temporary file and then pushed to
    `output_path` through `self.data_store`.

    Args:
        array: 2D (rows, cols) or 3D (bands, rows, cols) array of data to save.
        output_path: Destination file path.
        compress: Compression method; None or "NONE" (any case) adds no
            compression option.
        tiled: If True, tiles the output.
        blocksize: Block width and height used when `tiled` is True.
        crs: CRS override; defaults to the source dataset's CRS.
        transform: Affine transform override; defaults to the source's.
        nodata: Nodata override; defaults to the source's nodata value.
        **kwargs: Extra profile entries; they override the values above.

    Returns:
        `output_path` as a `Path`.

    Raises:
        ValueError: If `array` is neither 2D nor 3D.
    """
    output_path = Path(output_path)

    if array.ndim not in (2, 3):
        raise ValueError(f"Array must be 2D or 3D, got shape {array.shape}")
    if array.ndim == 2:
        # Promote a single band to (1, rows, cols)
        array = array[np.newaxis, :, :]

    num_bands, height, width = array.shape

    # Fill in any georeferencing the caller did not supply from the source
    with self.open_dataset() as src:
        crs = src.crs if crs is None else crs
        transform = src.transform if transform is None else transform
        nodata = src.nodata if nodata is None else nodata

    profile = {
        "driver": "GTiff",
        "height": height,
        "width": width,
        "count": num_bands,
        "dtype": array.dtype,
        "crs": crs,
        "transform": transform,
        "nodata": nodata,
    }

    wants_compression = bool(compress) and compress.upper() != "NONE"
    if wants_compression:
        profile["compress"] = compress.upper()
    if tiled:
        profile.update(tiled=True, blockxsize=blocksize, blockysize=blocksize)

    # Caller-supplied options take precedence over everything above
    profile.update(kwargs)

    # Reserve a temp file name; the handle is closed so it can be reopened
    with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        with rasterio.open(tmp_path, "w", **profile) as dst:
            # Raster band indexes are 1-based
            for band_number, band_data in enumerate(array, start=1):
                dst.write(band_data, band_number)

        # Hand the finished file to the data store as raw bytes
        with open(tmp_path, "rb") as finished:
            payload = finished.read()

        self.data_store.write_file(str(output_path), payload)

        self.logger.info(f"Array saved to {output_path}")

    finally:
        # Best-effort removal of the temporary file
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return output_path
save_to_file(output_path, compress='LZW', tiled=True, blocksize=512, bigtiff=None, predictor=None, num_threads=None, cog=False, overviews=None, overview_resampling='nearest', **kwargs)

Export the raster to a file with optimized settings.

Parameters:

Name Type Description Default
output_path Union[str, Path]

Output file path.

required
compress Optional[str]

Compression method (e.g., 'LZW', 'ZSTD').

'LZW'
tiled bool

If True, tiles the output for better performance.

True
blocksize int

Block size for tiled output.

512
bigtiff Optional[str]

'YES', 'NO', or 'IF_NEEDED' for large files.

None
predictor Optional[int]

Compression predictor (2 for int, 3 for float).

None
num_threads Optional[int]

Number of threads for compression.

None
cog bool

If True, creates a Cloud-Optimized GeoTIFF.

False
overviews Optional[List[int]]

Overview levels for COG.

None
overview_resampling str

Resampling method for overviews.

'nearest'
**kwargs

Additional creation options for rasterio.

{}

Returns:

Type Description
Path

Path to the saved TIF file.

Source code in gigaspatial/processing/tif_processor.py
def save_to_file(
    self,
    output_path: Union[str, Path],
    compress: Optional[str] = "LZW",
    tiled: bool = True,
    blocksize: int = 512,
    bigtiff: Optional[str] = None,
    predictor: Optional[int] = None,
    num_threads: Optional[int] = None,
    cog: bool = False,
    overviews: Optional[List[int]] = None,
    overview_resampling: str = "nearest",
    **kwargs,
) -> Path:
    """
    Export the raster to a file, applying the requested creation options.

    The source profile is copied, updated with the options below, written to
    a local temporary file band by band, and finally pushed to `output_path`
    through `self.data_store`.

    Args:
        output_path: Output file path.
        compress: Compression method (e.g., 'LZW', 'ZSTD'). None or "NONE"
            adds no compress option. DEFLATE, ZSTD, JPEG and WEBP get a
            default level/quality unless one is passed in `kwargs`.
        tiled: If True, sets tiling with `blocksize` x `blocksize` blocks.
        blocksize: Block size for tiled output.
        bigtiff: 'YES', 'NO', or 'IF_NEEDED' for large files.
        predictor: Compression predictor (2 for int, 3 for float).
        num_threads: Number of threads for compression.
        cog: If True, builds overviews and tags the file with LAYOUT="COG".
        overviews: Overview levels; defaults to [2, 4, 8, 16] when only
            `cog` is set.
        overview_resampling: Name of the `Resampling` member used for
            overviews.
        **kwargs: Additional creation options; they override the ones above.

    Returns:
        `output_path` as a `Path`.
    """
    output_path = Path(output_path)

    # Per-codec default tuning option, applied only if the caller gave none
    codec_defaults = {
        "DEFLATE": ("ZLEVEL", 6),
        "ZSTD": ("ZSTD_LEVEL", 9),
        "JPEG": ("JPEG_QUALITY", 85),
        "WEBP": ("WEBP_LEVEL", 75),
    }

    codec = compress.upper() if compress else None

    creation_options = {}
    if codec is not None and codec != "NONE":
        creation_options["compress"] = codec
    if tiled:
        creation_options.update(
            tiled=True, blockxsize=blocksize, blockysize=blocksize
        )
    if bigtiff:
        creation_options["BIGTIFF"] = bigtiff
    if predictor is not None:
        creation_options["predictor"] = predictor
    if num_threads is not None:
        creation_options["NUM_THREADS"] = num_threads

    if codec in codec_defaults:
        option_name, option_value = codec_defaults[codec]
        kwargs.setdefault(option_name, option_value)

    # Caller-supplied options win over the derived ones
    creation_options.update(kwargs)

    # rasterio needs a local file, so write to a temp file first
    with tempfile.NamedTemporaryFile(suffix=".tif", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        # open_dataset picks the merged/reprojected/clipped/original file
        with self.open_dataset() as src:
            profile = src.profile.copy()
            profile.update(**creation_options)

            with rasterio.open(tmp_path, "w", **profile) as dst:
                # Copy every band (1-based indexes)
                for band_idx in range(1, src.count + 1):
                    dst.write(src.read(band_idx), band_idx)

                if overviews or cog:
                    # COG without explicit levels gets a standard pyramid
                    levels = [2, 4, 8, 16] if overviews is None else overviews
                    resampling = getattr(Resampling, overview_resampling)
                    dst.build_overviews(levels, resampling)

                if cog:
                    dst.update_tags(LAYOUT="COG")

        # Hand the finished file to the data store as raw bytes
        with open(tmp_path, "rb") as finished:
            payload = finished.read()

        self.data_store.write_file(str(output_path), payload)

        self.logger.info(f"Raster saved to {output_path}")

    finally:
        # Best-effort removal of the temporary file
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    return output_path
to_dataframe(drop_nodata=True, check_memory=True, min_value=None, max_value=None, **kwargs)

Convert the raster data into a pandas DataFrame.

Parameters:

Name Type Description Default
drop_nodata

If True, pixels with the nodata value are excluded.

True
check_memory

If True, checks system memory availability before loading.

True
min_value Optional[float]

Optional minimum threshold to filter pixels.

None
max_value Optional[float]

Optional maximum threshold to filter pixels.

None
**kwargs

Additional arguments like band_number or band_names.

{}

Returns:

Type Description
DataFrame

A DataFrame with 'lon', 'lat', and band values.

Raises:

Type Description
ValueError

If processing fails due to mode mismatch or invalid data.

Source code in gigaspatial/processing/tif_processor.py
def to_dataframe(
    self,
    drop_nodata=True,
    check_memory=True,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Convert the raster data into a pandas DataFrame.

    Args:
        drop_nodata: If True, pixels with the nodata value are excluded.
        check_memory: If True, checks system memory availability before loading.
        min_value: Optional minimum threshold to filter pixels.
        max_value: Optional maximum threshold to filter pixels.
        **kwargs: Additional arguments. `band_names` is always forwarded;
            `band_number` (default 1) is only honoured in "single" mode, all
            other modes convert every band.

    Returns:
        A DataFrame with 'lon', 'lat', and band values.

    Raises:
        ValueError: If processing fails due to mode mismatch or invalid data.
            The underlying exception is attached as `__cause__`.
    """
    # Memory guard check (outside the try so its errors are not re-wrapped)
    if check_memory:
        self._memory_guard("conversion", threshold_percent=80.0)

    try:
        # Single mode reads one selected band; every other mode reads all
        band_number = kwargs.get("band_number", 1) if self.mode == "single" else None
        return self._to_dataframe(
            band_number=band_number,
            drop_nodata=drop_nodata,
            band_names=kwargs.get("band_names"),
            min_value=min_value,
            max_value=max_value,
        )
    except Exception as e:
        # Chain explicitly so the root cause survives in the traceback
        raise ValueError(
            f"Failed to process TIF file in mode '{self.mode}'. "
            f"Please ensure the file is valid and matches the selected mode. "
            f"Original error: {str(e)}"
        ) from e
to_dataframe_chunked(drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs)

Convert raster to DataFrame using memory-efficient chunked processing.

Parameters:

Name Type Description Default
drop_nodata

Whether to exclude pixels with the nodata value.

True
chunk_size

Specific number of rows per chunk. If None, it is auto-calculated.

None
target_memory_mb

Target memory limit per chunk in megabytes.

500
**kwargs

Additional arguments like band_number or band_names.

{}

Returns:

Type Description

A consolidated DataFrame containing all processed chunks.

Source code in gigaspatial/processing/tif_processor.py
def to_dataframe_chunked(
    self, drop_nodata=True, chunk_size=None, target_memory_mb=500, **kwargs
):
    """
    Build the raster's DataFrame chunk by chunk to limit memory use.

    Args:
        drop_nodata: Whether to exclude pixels with the nodata value.
        chunk_size: Number of rows per chunk. When None, it is derived via
            `_calculate_optimal_chunk_size` from `target_memory_mb`.
        target_memory_mb: Target memory limit per chunk in megabytes; only
            used when `chunk_size` is None.
        **kwargs: `band_names` is always forwarded; `band_number`
            (default 1) is only honoured in "single" mode.

    Returns:
        Whatever `_to_dataframe_chunked` produces for the computed windows
        (a consolidated DataFrame of all processed chunks).
    """
    rows_per_chunk = chunk_size
    if rows_per_chunk is None:
        rows_per_chunk = self._calculate_optimal_chunk_size(
            "conversion", target_memory_mb
        )

    windows = self._get_chunk_windows(rows_per_chunk)

    # "single" mode reads one band; rgb, rgba and multi read all of them
    selected_band = kwargs.get("band_number", 1) if self.mode == "single" else None

    return self._to_dataframe_chunked(
        windows,
        band_number=selected_band,
        drop_nodata=drop_nodata,
        band_names=kwargs.get("band_names", None),
    )
to_geodataframe(check_memory=True, min_value=None, max_value=None, **kwargs)

Convert the raster data into a GeoDataFrame.

Each row represents a pixel, with a box (polygon) geometry covering the pixel's spatial extent.

Parameters:

Name Type Description Default
check_memory

If True, checks system memory availability.

True
min_value Optional[float]

Optional minimum threshold for pixel values.

None
max_value Optional[float]

Optional maximum threshold for pixel values.

None
**kwargs

Additional arguments passed to to_dataframe.

{}

Returns:

Type Description
GeoDataFrame

A GeoDataFrame containing one box geometry per pixel along with the pixel values.

Source code in gigaspatial/processing/tif_processor.py
def to_geodataframe(
    self,
    check_memory=True,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Convert the raster data into a GeoDataFrame of per-pixel boxes.

    The pixels are first turned into a DataFrame via `to_dataframe` (so any
    value filtering happens before geometries are built); each remaining row
    then gets a box centred on its 'lon'/'lat' and sized by `self.resolution`.

    Args:
        check_memory: If True, checks system memory availability.
        min_value: Optional minimum threshold for pixel values.
        max_value: Optional maximum threshold for pixel values.
        **kwargs: Additional arguments passed to `to_dataframe`.

    Returns:
        A GeoDataFrame in `self.crs` with one box geometry per pixel row.
    """
    if check_memory:
        self._memory_guard("conversion", threshold_percent=80.0)

    # The guard already ran above, so skip it in the nested call
    df = self.to_dataframe(
        check_memory=False, min_value=min_value, max_value=max_value, **kwargs
    )

    x_res, y_res = self.resolution
    half_width = x_res / 2
    half_height = y_res / 2

    def _pixel_box(lon, lat):
        # Box spanning half a pixel on each side of the pixel centre
        return box(
            lon - half_width,
            lat - half_height,
            lon + half_width,
            lat + half_height,
        )

    pixel_boxes = [_pixel_box(lon, lat) for lon, lat in zip(df["lon"], df["lat"])]

    return gpd.GeoDataFrame(df, geometry=pixel_boxes, crs=self.crs)
to_graph(connectivity=4, band=None, include_coordinates=False, graph_type='networkx', check_memory=True)

Convert the raster into a graph representation based on pixel adjacency.

Parameters:

Name Type Description Default
connectivity Literal[4, 8]

Neighborhood connectivity (4 for von Neumann, 8 for Moore).

4
band Optional[int]

Band number to use for node values (1-indexed).

None
include_coordinates bool

If True, adds 'x' and 'y' attributes to nodes.

False
graph_type Literal['networkx', 'sparse']

Output type ('networkx' for Graph object, 'sparse' for CSR matrix).

'networkx'
check_memory bool

If True, validates memory availability before processing.

True

Returns:

Type Description
Union[Graph, csr_matrix]

A NetworkX Graph or a SciPy sparse CSR matrix.

Source code in gigaspatial/processing/tif_processor.py
def to_graph(
    self,
    connectivity: Literal[4, 8] = 4,
    band: Optional[int] = None,
    include_coordinates: bool = False,
    graph_type: Literal["networkx", "sparse"] = "networkx",
    check_memory: bool = True,
) -> Union[nx.Graph, sp.csr_matrix]:
    """
    Convert the raster into a graph representation based on pixel adjacency.

    Every valid (non-nodata) pixel becomes a node whose ID is its position in
    row-major order among the valid pixels. Adjacent valid pixels are joined
    by an undirected edge; the edge weight is the pixel value of the endpoint
    with the larger node ID.

    Args:
        connectivity: Neighborhood connectivity (4 for von Neumann, 8 for Moore).
            Any value other than 4 is treated as 8.
        band: Band number to use for node values (1-indexed). When None, the
            first band is used.
        include_coordinates: If True, adds 'x' and 'y' attributes to nodes.
            Only applies to the 'networkx' output; the sparse matrix carries
            no node attributes.
        graph_type: Output type ('networkx' for Graph object, 'sparse' for CSR matrix).
        check_memory: If True, validates memory availability before processing.

    Returns:
        A NetworkX Graph or a SciPy sparse CSR matrix. The sparse matrix is a
        symmetric adjacency matrix of shape (num_valid_pixels, num_valid_pixels)
        and is empty (no stored entries) when no two valid pixels are adjacent.

    Raises:
        ValueError: If `band` is outside the range of bands in the raster.
    """

    # Memory guard check
    if check_memory:
        self._memory_guard("graph", threshold_percent=80.0)

    with self.open_dataset() as src:
        band_idx = band - 1 if band is not None else 0
        if band_idx < 0 or band_idx >= src.count:
            raise ValueError(
                f"Band {band} not available. Raster has {src.count} bands"
            )

        data = src.read(band_idx + 1)
        nodata = src.nodata if src.nodata is not None else self.nodata

        if nodata is None:
            valid_mask = np.ones_like(data, dtype=bool)
        elif isinstance(nodata, (float, np.floating)) and np.isnan(nodata):
            # NaN never compares equal to anything, so `data != nodata` would
            # mark every pixel as valid; test for NaN explicitly instead.
            valid_mask = ~np.isnan(data)
        else:
            valid_mask = data != nodata

        height, width = data.shape

        # Find all valid pixels
        valid_rows, valid_cols = np.where(valid_mask)
        num_valid_pixels = len(valid_rows)

        # Create a sequential mapping from (row, col) to a node ID
        # (-1 marks pixels that are not part of the graph).
        node_map = np.full(data.shape, -1, dtype=int)
        node_map[valid_rows, valid_cols] = np.arange(num_valid_pixels)

        # Define neighborhood offsets as (row delta, col delta)
        if connectivity == 4:
            # von Neumann neighborhood (4-connectivity)
            offsets = [(-1, 0), (1, 0), (0, -1), (0, 1)]
        else:  # connectivity == 8
            # Moore neighborhood (8-connectivity)
            offsets = [
                (-1, -1),
                (-1, 0),
                (-1, 1),
                (0, -1),
                (0, 1),
                (1, -1),
                (1, 0),
                (1, 1),
            ]

        # Collect nodes and edges
        nodes_to_add = []
        edges_to_add = []

        for row, col in zip(valid_rows, valid_cols):
            current_node_id = node_map[row, col]

            # Prepare node attributes
            node_attrs = {"value": float(data[row, col])}
            if include_coordinates:
                x, y = src.xy(row, col)
                node_attrs["x"] = x
                node_attrs["y"] = y
            nodes_to_add.append((current_node_id, node_attrs))

            # Find neighbors and collect edges
            for dy, dx in offsets:
                neighbor_row, neighbor_col = row + dy, col + dx

                # Skip neighbors outside the raster or flagged as nodata
                if not (
                    0 <= neighbor_row < height
                    and 0 <= neighbor_col < width
                    and valid_mask[neighbor_row, neighbor_col]
                ):
                    continue

                neighbor_node_id = node_map[neighbor_row, neighbor_col]

                # Ensure each undirected edge is added only once: emit it
                # from the endpoint with the smaller node ID.
                if current_node_id < neighbor_node_id:
                    neighbor_value = float(data[neighbor_row, neighbor_col])
                    edges_to_add.append(
                        (current_node_id, neighbor_node_id, neighbor_value)
                    )

        if graph_type == "networkx":
            G = nx.Graph()
            G.add_nodes_from(nodes_to_add)
            G.add_weighted_edges_from(edges_to_add)
            return G

        # Sparse matrix output. Build the index and weight arrays separately
        # so node IDs stay integral and an empty edge list yields empty 1-D
        # arrays instead of failing on 2-D slicing.
        if edges_to_add:
            edge_sources, edge_targets, edge_weights = zip(*edges_to_add)
        else:
            edge_sources, edge_targets, edge_weights = (), (), ()

        row_indices = np.asarray(edge_sources, dtype=np.intp)
        col_indices = np.asarray(edge_targets, dtype=np.intp)
        weights = np.asarray(edge_weights, dtype=float)

        # Add reverse edges for symmetric matrix
        from_idx = np.concatenate([row_indices, col_indices])
        to_idx = np.concatenate([col_indices, row_indices])
        weights = np.concatenate([weights, weights])

        return sp.coo_matrix(
            (weights, (from_idx, to_idx)),
            shape=(num_valid_pixels, num_valid_pixels),
        ).tocsr()
validate_dataset_path(value)

Validates that at least one dataset path is provided.

Source code in gigaspatial/processing/tif_processor.py
@field_validator("dataset_path")
def validate_dataset_path(cls, value):
    """Validates that at least one dataset path is provided."""
    if isinstance(value, list):
        if path_len := len(value):
            if path_len == 1:
                return value[0]
            return value

        raise ValueError("No dataset paths provided.")

    if isinstance(value, (Path, str)):
        return value