Handlers Module

gigaspatial.handlers

base

BaseHandler

Bases: ABC

Abstract base class that orchestrates configuration, downloading, and reading functionality.

This class serves as the main entry point for dataset handlers, providing a unified interface for data acquisition and loading. It manages the lifecycle of config, downloader, and reader components.

Subclasses should implement the abstract methods to provide specific handler types and define how components are created and interact.
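
The high-level workflow this class provides can be sketched with a small helper that drives any concrete handler through its public interface. This is a minimal, hypothetical sketch: fetch_for_country is not part of the library, and a real dataset handler instance must be supplied.

from gigaspatial.handlers.base import BaseHandler


def fetch_for_country(handler: BaseHandler, country: str):
    """Hypothetical helper: download (if needed) and load data for a country."""
    # The context manager guarantees cleanup() runs even if loading fails.
    with handler:
        info = handler.get_available_data_info(country)
        handler.logger.info(
            f"{info['available_data_units']}/{info['total_data_units']} data units cached"
        )
        # download_and_load() runs ensure_data_available() and then reader.load()
        return handler.download_and_load(country)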

Source code in gigaspatial/handlers/base.py
class BaseHandler(ABC):
    """
    Abstract base class that orchestrates configuration, downloading, and reading functionality.

    This class serves as the main entry point for dataset handlers, providing a unified
    interface for data acquisition and loading. It manages the lifecycle of config,
    downloader, and reader components.

    Subclasses should implement the abstract methods to provide specific handler types
    and define how components are created and interact.
    """

    def __init__(
        self,
        config: Optional[BaseHandlerConfig] = None,
        downloader: Optional[BaseHandlerDownloader] = None,
        reader: Optional[BaseHandlerReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the BaseHandler with optional components.

        Args:
            config: Configuration object. If None, will be created via create_config()
            downloader: Downloader instance. If None, will be created via create_downloader()
            reader: Reader instance. If None, will be created via create_reader()
            data_store: Data store instance. Defaults to LocalDataStore if not provided
            logger: Logger instance. If not provided, creates one based on class name
        """
        # Initialize data store first as it's used by other components
        self.data_store = data_store or LocalDataStore()

        # Initialize logger
        self.logger = logger or global_config.get_logger(self.__class__.__name__)

        # Initialize or create config
        self._config = config
        if self._config is None:
            self._config = self.create_config(
                data_store=self.data_store, logger=self.logger
            )

        # Initialize or create downloader
        self._downloader = downloader
        if self._downloader is None:
            self._downloader = self.create_downloader(
                config=self._config, data_store=self.data_store, logger=self.logger
            )

        # Initialize or create reader
        self._reader = reader
        if self._reader is None:
            self._reader = self.create_reader(
                config=self._config, data_store=self.data_store, logger=self.logger
            )

    @property
    def config(self) -> BaseHandlerConfig:
        """Get the configuration object."""
        return self._config

    @property
    def downloader(self) -> BaseHandlerDownloader:
        """Get the downloader object."""
        return self._downloader

    @property
    def reader(self) -> BaseHandlerReader:
        """Get the reader object."""
        return self._reader

    # Abstract factory methods for creating components
    @abstractmethod
    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> BaseHandlerConfig:
        """
        Create and return a configuration object for this handler.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured BaseHandlerConfig instance
        """
        pass

    @abstractmethod
    def create_downloader(
        self,
        config: BaseHandlerConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> BaseHandlerDownloader:
        """
        Create and return a downloader object for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured BaseHandlerDownloader instance
        """
        pass

    @abstractmethod
    def create_reader(
        self,
        config: BaseHandlerConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> BaseHandlerReader:
        """
        Create and return a reader object for this handler.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured BaseHandlerReader instance
        """
        pass

    # High-level interface methods
    def ensure_data_available(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        force_download: bool = False,
        **kwargs,
    ) -> bool:
        """
        Ensure that data is available for the given source.

        This method checks if the required data exists locally, and if not (or if
        force_download is True), downloads it using the downloader.

        Args:
            source: The data source specification
            force_download: If True, download even if data exists locally
            **kwargs: Additional parameters passed to download methods

        Returns:
            bool: True if data is available after this operation
        """
        try:
            # Resolve what data units are needed
            if hasattr(self.config, "get_relevant_data_units"):
                data_units = self.config.get_relevant_data_units(source, **kwargs)
                data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
            else:
                # Fallback: try to resolve paths directly
                if hasattr(self.reader, "resolve_source_paths"):
                    data_paths = self.reader.resolve_source_paths(source, **kwargs)
                else:
                    self.logger.warning("Cannot determine required data paths")
                    return False

            # Check if data exists (unless force download)
            if not force_download:
                missing_paths = [
                    path
                    for path in data_paths
                    if not self.data_store.file_exists(str(path))
                ]
                if not missing_paths:
                    self.logger.info("All required data is already available")
                    return True

            # Download missing or all data
            if hasattr(self.config, "get_relevant_data_units"):
                data_units = self.config.get_relevant_data_units(source, **kwargs)
                self.downloader.download_data_units(data_units, **kwargs)
            else:
                self.downloader.download(source, **kwargs)

            return True

        except Exception as e:
            self.logger.error(f"Failed to ensure data availability: {e}")
            return False

    def load_data(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ) -> Any:
        """
        Load data from the given source.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            Loaded data (type depends on specific handler implementation)
        """
        if ensure_available:
            if not self.ensure_data_available(source, **kwargs):
                raise RuntimeError("Could not ensure data availability for loading")

        return self.reader.load(source, **kwargs)

    def download_and_load(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        force_download: bool = False,
        **kwargs,
    ) -> Any:
        """
        Convenience method to download (if needed) and load data in one call.

        Args:
            source: The data source specification
            force_download: If True, download even if data exists locally
            **kwargs: Additional parameters

        Returns:
            Loaded data
        """
        self.ensure_data_available(source, force_download=force_download, **kwargs)
        return self.reader.load(source, **kwargs)

    def get_available_data_info(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
        ],
        **kwargs,
    ) -> dict:
        """
        Get information about available data for the given source.

        Args:
            source: The data source specification
            **kwargs: Additional parameters

        Returns:
            dict: Information about data availability, paths, etc.
        """
        try:
            if hasattr(self.config, "get_relevant_data_units"):
                data_units = self.config.get_relevant_data_units(source, **kwargs)
                data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
            else:
                data_paths = self.reader.resolve_source_paths(source, **kwargs)

            existing_paths = [
                path for path in data_paths if self.data_store.file_exists(str(path))
            ]
            missing_paths = [
                path
                for path in data_paths
                if not self.data_store.file_exists(str(path))
            ]

            return {
                "total_data_units": len(data_paths),
                "available_data_units": len(existing_paths),
                "missing_data_units": len(missing_paths),
                "available_paths": existing_paths,
                "missing_paths": missing_paths,
                "all_available": len(missing_paths) == 0,
            }

        except Exception as e:
            self.logger.error(f"Failed to get data info: {e}")
            return {
                "error": str(e),
                "total_data_units": 0,
                "available_data_units": 0,
                "missing_data_units": 0,
                "available_paths": [],
                "missing_paths": [],
                "all_available": False,
            }

    def cleanup(self):
        """
        Cleanup resources used by the handler.

        Override in subclasses if specific cleanup is needed.
        """
        self.logger.info(f"Cleaning up {self.__class__.__name__}")
        # Subclasses can override to add specific cleanup logic

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.cleanup()

    def __repr__(self) -> str:
        """String representation of the handler."""
        return (
            f"{self.__class__.__name__}("
            f"config={self.config.__class__.__name__}, "
            f"downloader={self.downloader.__class__.__name__}, "
            f"reader={self.reader.__class__.__name__})"
        )
config: BaseHandlerConfig property

Get the configuration object.

downloader: BaseHandlerDownloader property

Get the downloader object.

reader: BaseHandlerReader property

Get the reader object.

__enter__()

Context manager entry.

Source code in gigaspatial/handlers/base.py
def __enter__(self):
    """Context manager entry."""
    return self
__exit__(exc_type, exc_val, exc_tb)

Context manager exit.

Source code in gigaspatial/handlers/base.py
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit."""
    self.cleanup()
__init__(config=None, downloader=None, reader=None, data_store=None, logger=None)

Initialize the BaseHandler with optional components.

Parameters:

    config (Optional[BaseHandlerConfig], default None): Configuration object. If None, will be created via create_config()
    downloader (Optional[BaseHandlerDownloader], default None): Downloader instance. If None, will be created via create_downloader()
    reader (Optional[BaseHandlerReader], default None): Reader instance. If None, will be created via create_reader()
    data_store (Optional[DataStore], default None): Data store instance. Defaults to LocalDataStore if not provided
    logger (Optional[Logger], default None): Logger instance. If not provided, creates one based on class name

Source code in gigaspatial/handlers/base.py
def __init__(
    self,
    config: Optional[BaseHandlerConfig] = None,
    downloader: Optional[BaseHandlerDownloader] = None,
    reader: Optional[BaseHandlerReader] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the BaseHandler with optional components.

    Args:
        config: Configuration object. If None, will be created via create_config()
        downloader: Downloader instance. If None, will be created via create_downloader()
        reader: Reader instance. If None, will be created via create_reader()
        data_store: Data store instance. Defaults to LocalDataStore if not provided
        logger: Logger instance. If not provided, creates one based on class name
    """
    # Initialize data store first as it's used by other components
    self.data_store = data_store or LocalDataStore()

    # Initialize logger
    self.logger = logger or global_config.get_logger(self.__class__.__name__)

    # Initialize or create config
    self._config = config
    if self._config is None:
        self._config = self.create_config(
            data_store=self.data_store, logger=self.logger
        )

    # Initialize or create downloader
    self._downloader = downloader
    if self._downloader is None:
        self._downloader = self.create_downloader(
            config=self._config, data_store=self.data_store, logger=self.logger
        )

    # Initialize or create reader
    self._reader = reader
    if self._reader is None:
        self._reader = self.create_reader(
            config=self._config, data_store=self.data_store, logger=self.logger
        )
__repr__()

String representation of the handler.

Source code in gigaspatial/handlers/base.py
def __repr__(self) -> str:
    """String representation of the handler."""
    return (
        f"{self.__class__.__name__}("
        f"config={self.config.__class__.__name__}, "
        f"downloader={self.downloader.__class__.__name__}, "
        f"reader={self.reader.__class__.__name__})"
    )
cleanup()

Cleanup resources used by the handler.

Override in subclasses if specific cleanup is needed.

Source code in gigaspatial/handlers/base.py
def cleanup(self):
    """
    Cleanup resources used by the handler.

    Override in subclasses if specific cleanup is needed.
    """
    self.logger.info(f"Cleaning up {self.__class__.__name__}")
create_config(data_store, logger, **kwargs) abstractmethod

Create and return a configuration object for this handler.

Parameters:

    data_store (DataStore, required): The data store instance to use
    logger (Logger, required): The logger instance to use
    **kwargs: Additional configuration parameters

Returns:

    BaseHandlerConfig: Configured BaseHandlerConfig instance

Source code in gigaspatial/handlers/base.py
@abstractmethod
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> BaseHandlerConfig:
    """
    Create and return a configuration object for this handler.

    Args:
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional configuration parameters

    Returns:
        Configured BaseHandlerConfig instance
    """
    pass
create_downloader(config, data_store, logger, **kwargs) abstractmethod

Create and return a downloader object for this handler.

Parameters:

    config (BaseHandlerConfig, required): The configuration object
    data_store (DataStore, required): The data store instance to use
    logger (Logger, required): The logger instance to use
    **kwargs: Additional downloader parameters

Returns:

    BaseHandlerDownloader: Configured BaseHandlerDownloader instance

Source code in gigaspatial/handlers/base.py
@abstractmethod
def create_downloader(
    self,
    config: BaseHandlerConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> BaseHandlerDownloader:
    """
    Create and return a downloader object for this handler.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional downloader parameters

    Returns:
        Configured BaseHandlerDownloader instance
    """
    pass
create_reader(config, data_store, logger, **kwargs) abstractmethod

Create and return a reader object for this handler.

Parameters:

    config (BaseHandlerConfig, required): The configuration object
    data_store (DataStore, required): The data store instance to use
    logger (Logger, required): The logger instance to use
    **kwargs: Additional reader parameters

Returns:

    BaseHandlerReader: Configured BaseHandlerReader instance

Source code in gigaspatial/handlers/base.py
@abstractmethod
def create_reader(
    self,
    config: BaseHandlerConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> BaseHandlerReader:
    """
    Create and return a reader object for this handler.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional reader parameters

    Returns:
        Configured BaseHandlerReader instance
    """
    pass
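
Taken together, these three factory methods are all a concrete handler has to override; downloading and loading behaviour comes from the base class. The sketch below is hypothetical and assumes the DegreeTileConfig, DegreeTileDownloader, and DegreeTileReader classes sketched further down this page.

from gigaspatial.handlers.base import BaseHandler


class DegreeTileHandler(BaseHandler):
    """Hypothetical handler wiring together the component sketches on this page."""

    def create_config(self, data_store, logger, **kwargs):
        return DegreeTileConfig(data_store=data_store, logger=logger, **kwargs)

    def create_downloader(self, config, data_store, logger, **kwargs):
        return DegreeTileDownloader(config=config, data_store=data_store, logger=logger)

    def create_reader(self, config, data_store, logger, **kwargs):
        return DegreeTileReader(config=config, data_store=data_store, logger=logger)
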
download_and_load(source, force_download=False, **kwargs)

Convenience method to download (if needed) and load data in one call.

Parameters:

    source (Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]], required): The data source specification
    force_download (bool, default False): If True, download even if data exists locally
    **kwargs: Additional parameters

Returns:

    Any: Loaded data

Source code in gigaspatial/handlers/base.py
def download_and_load(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    force_download: bool = False,
    **kwargs,
) -> Any:
    """
    Convenience method to download (if needed) and load data in one call.

    Args:
        source: The data source specification
        force_download: If True, download even if data exists locally
        **kwargs: Additional parameters

    Returns:
        Loaded data
    """
    self.ensure_data_available(source, force_download=force_download, **kwargs)
    return self.reader.load(source, **kwargs)
ensure_data_available(source, force_download=False, **kwargs)

Ensure that data is available for the given source.

This method checks if the required data exists locally, and if not (or if force_download is True), downloads it using the downloader.

Parameters:

    source (Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]], required): The data source specification
    force_download (bool, default False): If True, download even if data exists locally
    **kwargs: Additional parameters passed to download methods

Returns:

    bool: True if data is available after this operation
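
A minimal sketch of how the boolean result is typically consumed; load_if_available is a hypothetical helper, and handler stands for any concrete BaseHandler subclass instance.

from gigaspatial.handlers.base import BaseHandler


def load_if_available(handler: BaseHandler, source):
    """Hypothetical helper: fail fast if the data cannot be made available."""
    if not handler.ensure_data_available(source, force_download=False):
        raise RuntimeError(f"Required data for {source!r} could not be downloaded")
    # Data is known to be present, so skip the redundant availability check.
    return handler.load_data(source, ensure_available=False)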

Source code in gigaspatial/handlers/base.py
def ensure_data_available(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    force_download: bool = False,
    **kwargs,
) -> bool:
    """
    Ensure that data is available for the given source.

    This method checks if the required data exists locally, and if not (or if
    force_download is True), downloads it using the downloader.

    Args:
        source: The data source specification
        force_download: If True, download even if data exists locally
        **kwargs: Additional parameters passed to download methods

    Returns:
        bool: True if data is available after this operation
    """
    try:
        # Resolve what data units are needed
        if hasattr(self.config, "get_relevant_data_units"):
            data_units = self.config.get_relevant_data_units(source, **kwargs)
            data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
        else:
            # Fallback: try to resolve paths directly
            if hasattr(self.reader, "resolve_source_paths"):
                data_paths = self.reader.resolve_source_paths(source, **kwargs)
            else:
                self.logger.warning("Cannot determine required data paths")
                return False

        # Check if data exists (unless force download)
        if not force_download:
            missing_paths = [
                path
                for path in data_paths
                if not self.data_store.file_exists(str(path))
            ]
            if not missing_paths:
                self.logger.info("All required data is already available")
                return True

        # Download missing or all data
        if hasattr(self.config, "get_relevant_data_units"):
            data_units = self.config.get_relevant_data_units(source, **kwargs)
            self.downloader.download_data_units(data_units, **kwargs)
        else:
            self.downloader.download(source, **kwargs)

        return True

    except Exception as e:
        self.logger.error(f"Failed to ensure data availability: {e}")
        return False
get_available_data_info(source, **kwargs)

Get information about available data for the given source.

Parameters:

    source (Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame], required): The data source specification
    **kwargs: Additional parameters

Returns:

    dict: Information about data availability, paths, etc.
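
A short, hypothetical helper showing how the returned dictionary keys are typically used; report_missing is not part of the library.

from gigaspatial.handlers.base import BaseHandler


def report_missing(handler: BaseHandler, source) -> list:
    """Hypothetical helper: list the paths that still need to be downloaded."""
    info = handler.get_available_data_info(source)
    if "error" in info:
        raise RuntimeError(info["error"])
    print(f"{info['missing_data_units']} of {info['total_data_units']} units missing")
    return info["missing_paths"]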

Source code in gigaspatial/handlers/base.py
def get_available_data_info(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
    ],
    **kwargs,
) -> dict:
    """
    Get information about available data for the given source.

    Args:
        source: The data source specification
        **kwargs: Additional parameters

    Returns:
        dict: Information about data availability, paths, etc.
    """
    try:
        if hasattr(self.config, "get_relevant_data_units"):
            data_units = self.config.get_relevant_data_units(source, **kwargs)
            data_paths = self.config.get_data_unit_paths(data_units, **kwargs)
        else:
            data_paths = self.reader.resolve_source_paths(source, **kwargs)

        existing_paths = [
            path for path in data_paths if self.data_store.file_exists(str(path))
        ]
        missing_paths = [
            path
            for path in data_paths
            if not self.data_store.file_exists(str(path))
        ]

        return {
            "total_data_units": len(data_paths),
            "available_data_units": len(existing_paths),
            "missing_data_units": len(missing_paths),
            "available_paths": existing_paths,
            "missing_paths": missing_paths,
            "all_available": len(missing_paths) == 0,
        }

    except Exception as e:
        self.logger.error(f"Failed to get data info: {e}")
        return {
            "error": str(e),
            "total_data_units": 0,
            "available_data_units": 0,
            "missing_data_units": 0,
            "available_paths": [],
            "missing_paths": [],
            "all_available": False,
        }
load_data(source, ensure_available=True, **kwargs)

Load data from the given source.

Parameters:

    source (Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]], required): The data source specification
    ensure_available (bool, default True): If True, ensure data is downloaded before loading
    **kwargs: Additional parameters passed to load methods

Returns:

    Any: Loaded data (type depends on specific handler implementation)

Source code in gigaspatial/handlers/base.py
def load_data(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    ensure_available: bool = True,
    **kwargs,
) -> Any:
    """
    Load data from the given source.

    Args:
        source: The data source specification
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        Loaded data (type depends on specific handler implementation)
    """
    if ensure_available:
        if not self.ensure_data_available(source, **kwargs):
            raise RuntimeError("Could not ensure data availability for loading")

    return self.reader.load(source, **kwargs)

BaseHandlerConfig dataclass

Bases: ABC

Abstract base class for handler configuration objects. Provides standard fields for path, parallelism, data store, and logger. Extend this class for dataset-specific configuration.
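
A hypothetical, minimal subclass illustrating the three abstract methods. The 1-degree tiling scheme, the DegreeTileConfig name, and the file naming are assumptions for illustration only.

import math
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Iterable, List, Union

import geopandas as gpd
from shapely.geometry import Point
from shapely.geometry.base import BaseGeometry

from gigaspatial.handlers.base import BaseHandlerConfig


@dataclass
class DegreeTileConfig(BaseHandlerConfig):
    """Hypothetical config: one file per 1-degree longitude/latitude tile."""

    base_path: Path = Path("mydataset/tiles")

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[tuple]:
        # Tiles intersecting the bounding box of the geometry.
        if isinstance(geometry, gpd.GeoDataFrame):
            minx, miny, maxx, maxy = geometry.total_bounds
        else:
            minx, miny, maxx, maxy = geometry.bounds
        return [
            (lon, lat)
            for lon in range(math.floor(minx), math.floor(maxx) + 1)
            for lat in range(math.floor(miny), math.floor(maxy) + 1)
        ]

    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> List[tuple]:
        units = set()
        for p in points:
            lon, lat = (p.x, p.y) if isinstance(p, Point) else (p[0], p[1])
            units.add((math.floor(lon), math.floor(lat)))
        return sorted(units)

    def get_data_unit_path(self, unit: Any, **kwargs) -> Path:
        lon, lat = unit
        return self.base_path / f"tile_{lon}_{lat}.parquet"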

Source code in gigaspatial/handlers/base.py
@dataclass
class BaseHandlerConfig(ABC):
    """
    Abstract base class for handler configuration objects.
    Provides standard fields for path, parallelism, data store, and logger.
    Extend this class for dataset-specific configuration.
    """

    base_path: Path = None
    n_workers: int = multiprocessing.cpu_count()
    data_store: DataStore = field(default_factory=LocalDataStore)
    logger: logging.Logger = field(default=None, repr=False)

    def __post_init__(self):
        if self.logger is None:
            self.logger = global_config.get_logger(self.__class__.__name__)

    def get_relevant_data_units(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
        ],
        **kwargs,
    ):
        if isinstance(source, str):
            data_units = self.get_relevant_data_units_by_country(source, **kwargs)
        elif isinstance(source, (BaseGeometry, gpd.GeoDataFrame)):
            data_units = self.get_relevant_data_units_by_geometry(source, **kwargs)
        elif isinstance(source, Iterable):
            if all(isinstance(p, (Iterable, Point)) for p in source):
                data_units = self.get_relevant_data_units_by_points(source, **kwargs)
            else:
                raise ValueError(
                    "List input to get_relevant_data_units must be all points."
                )
        else:
            raise NotImplementedError(f"Unsupported source type: {type(source)}")

        return data_units

    @abstractmethod
    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> Any:
        """
        Given a geometry, return a list of relevant data unit identifiers (e.g., tiles, files, resources).
        """
        pass

    @abstractmethod
    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> Any:
        """
        Given a list of points, return a list of relevant data unit identifiers.
        """
        pass

    def get_relevant_data_units_by_country(self, country: str, **kwargs) -> Any:
        """
        Given a country code or name, return a list of relevant data unit identifiers.
        """
        from gigaspatial.handlers.boundaries import AdminBoundaries

        country_geometry = (
            AdminBoundaries.create(country_code=country, **kwargs)
            .boundaries[0]
            .geometry
        )
        return self.get_relevant_data_units_by_geometry(
            geometry=country_geometry, **kwargs
        )

    @abstractmethod
    def get_data_unit_path(self, unit: Any, **kwargs) -> list:
        """
        Given a data unit identifier, return the corresponding file path.
        """
        pass

    def get_data_unit_paths(self, units: Union[Iterable[Any]], **kwargs) -> list:
        """
        Given data unit identifiers, return the corresponding file paths.
        """
        if not isinstance(units, Iterable):
            units = [units]

        if not units:
            return []

        return [self.get_data_unit_path(unit=unit, **kwargs) for unit in units]
get_data_unit_path(unit, **kwargs) abstractmethod

Given a data unit identifier, return the corresponding file path.

Source code in gigaspatial/handlers/base.py
@abstractmethod
def get_data_unit_path(self, unit: Any, **kwargs) -> list:
    """
    Given a data unit identifier, return the corresponding file path.
    """
    pass
get_data_unit_paths(units, **kwargs)

Given data unit identifiers, return the corresponding file paths.

Source code in gigaspatial/handlers/base.py
def get_data_unit_paths(self, units: Union[Iterable[Any]], **kwargs) -> list:
    """
    Given data unit identifiers, return the corresponding file paths.
    """
    if not isinstance(units, Iterable):
        units = [units]

    if not units:
        return []

    return [self.get_data_unit_path(unit=unit, **kwargs) for unit in units]
get_relevant_data_units_by_country(country, **kwargs)

Given a country code or name, return a list of relevant data unit identifiers.

Source code in gigaspatial/handlers/base.py
def get_relevant_data_units_by_country(self, country: str, **kwargs) -> Any:
    """
    Given a country code or name, return a list of relevant data unit identifiers.
    """
    from gigaspatial.handlers.boundaries import AdminBoundaries

    country_geometry = (
        AdminBoundaries.create(country_code=country, **kwargs)
        .boundaries[0]
        .geometry
    )
    return self.get_relevant_data_units_by_geometry(
        geometry=country_geometry, **kwargs
    )
get_relevant_data_units_by_geometry(geometry, **kwargs) abstractmethod

Given a geometry, return a list of relevant data unit identifiers (e.g., tiles, files, resources).

Source code in gigaspatial/handlers/base.py
@abstractmethod
def get_relevant_data_units_by_geometry(
    self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
) -> Any:
    """
    Given a geometry, return a list of relevant data unit identifiers (e.g., tiles, files, resources).
    """
    pass
get_relevant_data_units_by_points(points, **kwargs) abstractmethod

Given a list of points, return a list of relevant data unit identifiers.

Source code in gigaspatial/handlers/base.py
@abstractmethod
def get_relevant_data_units_by_points(
    self, points: Iterable[Union[Point, tuple]], **kwargs
) -> Any:
    """
    Given a list of points, return a list of relevant data unit identifiers.
    """
    pass

BaseHandlerDownloader

Bases: ABC

Abstract base class for handler downloader classes. Standardizes config, data_store, and logger initialization. Extend this class for dataset-specific downloaders.
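
A hypothetical downloader sketch matching the DegreeTileConfig example above. The endpoint URL is a placeholder, and DataStore.write_file is an assumed method of the data store interface, shown only to indicate where the downloaded bytes would be persisted.

from typing import Any, Iterable, List

import requests

from gigaspatial.handlers.base import BaseHandlerDownloader


class DegreeTileDownloader(BaseHandlerDownloader):
    """Hypothetical downloader: fetches one file per 1-degree tile."""

    BASE_URL = "https://example.com/tiles"  # placeholder endpoint

    def download_data_unit(self, unit: Any, **kwargs) -> str:
        lon, lat = unit
        path = str(self.config.get_data_unit_path(unit))
        response = requests.get(f"{self.BASE_URL}/tile_{lon}_{lat}.parquet", timeout=60)
        response.raise_for_status()
        # Assumed data store API for persisting raw bytes.
        self.data_store.write_file(path, response.content)
        return path

    def download_data_units(self, units: Iterable[Any], **kwargs) -> List[str]:
        return [self.download_data_unit(unit, **kwargs) for unit in units]

    def download(self, source, **kwargs) -> List[str]:
        # get_relevant_data_units() dispatches on the source type
        # (country, points, geometry) via the config object.
        units = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(units, **kwargs)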

Source code in gigaspatial/handlers/base.py
class BaseHandlerDownloader(ABC):
    """
    Abstract base class for handler downloader classes.
    Standardizes config, data_store, and logger initialization.
    Extend this class for dataset-specific downloaders.
    """

    def __init__(
        self,
        config: Optional[BaseHandlerConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.config = config
        if data_store:
            self.data_store = data_store
        elif config and hasattr(config, "data_store"):
            self.data_store = config.data_store
        else:
            self.data_store = LocalDataStore()

        self.logger = (
            logger
            or (getattr(config, "logger", None) if config else None)
            or global_config.get_logger(self.__class__.__name__)
        )

    @abstractmethod
    def download_data_unit(self, *args, **kwargs):
        """
        Abstract method to download data. Implement in subclasses.
        """
        pass

    @abstractmethod
    def download_data_units(self, *args, **kwargs):
        """
        Abstract method to download data. Implement in subclasses.
        """
        pass

    @abstractmethod
    def download(self, *args, **kwargs):
        """
        Abstract method to download data. Implement in subclasses.
        """
        pass
download(*args, **kwargs) abstractmethod

Abstract method to download data. Implement in subclasses.

Source code in gigaspatial/handlers/base.py
@abstractmethod
def download(self, *args, **kwargs):
    """
    Abstract method to download data. Implement in subclasses.
    """
    pass
download_data_unit(*args, **kwargs) abstractmethod

Abstract method to download data. Implement in subclasses.

Source code in gigaspatial/handlers/base.py
@abstractmethod
def download_data_unit(self, *args, **kwargs):
    """
    Abstract method to download data. Implement in subclasses.
    """
    pass
download_data_units(*args, **kwargs) abstractmethod

Abstract method to download data. Implement in subclasses.

Source code in gigaspatial/handlers/base.py
@abstractmethod
def download_data_units(self, *args, **kwargs):
    """
    Abstract method to download data. Implement in subclasses.
    """
    pass

BaseHandlerReader

Bases: ABC

Abstract base class for handler reader classes. Provides common methods for resolving source paths and loading data. Supports resolving by country, points, geometry, GeoDataFrame, or explicit paths. Includes generic loader functions for raster and tabular data.
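
A hypothetical reader sketch completing the tile-based example: it validates the resolved paths and concatenates them with the generic tabular loader provided by this base class.

from pathlib import Path
from typing import Any, List, Union

from gigaspatial.handlers.base import BaseHandlerReader


class DegreeTileReader(BaseHandlerReader):
    """Hypothetical reader: loads and concatenates the resolved tile files."""

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Any:
        self._check_file_exists(source_data_path)
        return self._load_tabular_data(source_data_path)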

Source code in gigaspatial/handlers/base.py
class BaseHandlerReader(ABC):
    """
    Abstract base class for handler reader classes.
    Provides common methods for resolving source paths and loading data.
    Supports resolving by country, points, geometry, GeoDataFrame, or explicit paths.
    Includes generic loader functions for raster and tabular data.
    """

    def __init__(
        self,
        config: Optional[BaseHandlerConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        self.config = config
        if data_store:
            self.data_store = data_store
        elif config and hasattr(config, "data_store"):
            self.data_store = config.data_store
        else:
            self.data_store = LocalDataStore()

        self.logger = (
            logger
            or (getattr(config, "logger", None) if config else None)
            or global_config.get_logger(self.__class__.__name__)
        )

    def resolve_source_paths(
        self,
        source: Union[
            str,  # country code
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,
            gpd.GeoDataFrame,
            Path,  # path
            str,  # path
            List[Union[str, Path]],
        ],
        **kwargs,
    ) -> List[Union[str, Path]]:
        """
        Resolve source data paths based on the type of source input.

        Args:
            source: Can be a country code or name (str), list of points, geometry, GeoDataFrame, or explicit path(s)
            **kwargs: Additional parameters for path resolution

        Returns:
            List of resolved source paths
        """
        if isinstance(source, (str, Path)):
            # Could be a country code or a path
            if self.data_store.file_exists(str(source)) or str(source).endswith(
                (".csv", ".tif", ".json", ".parquet", ".gz", ".geojson", ".zip")
            ):
                source_data_paths = self.resolve_by_paths(source)
            else:
                source_data_paths = self.resolve_by_country(source, **kwargs)
        elif isinstance(source, (BaseGeometry, gpd.GeoDataFrame)):
            source_data_paths = self.resolve_by_geometry(source, **kwargs)
        elif isinstance(source, Iterable):
            # List of points or paths
            if all(isinstance(p, (Iterable, Point)) for p in source):
                source_data_paths = self.resolve_by_points(source, **kwargs)
            elif all(isinstance(p, (str, Path)) for p in source):
                source_data_paths = self.resolve_by_paths(source)
            else:
                raise ValueError(
                    "List input to resolve_source_paths must be all points or all paths."
                )
        else:
            raise NotImplementedError(f"Unsupported source type: {type(source)}")

        self.logger.info(f"Resolved {len(source_data_paths)} paths!")
        return source_data_paths

    def resolve_by_country(self, country: str, **kwargs) -> List[Union[str, Path]]:
        """
        Resolve source paths for a given country code/name.
        Uses the config's get_relevant_data_units_by_country method.
        """
        if not self.config:
            raise ValueError("Config is required for resolving by country")
        data_units = self.config.get_relevant_data_units_by_country(country, **kwargs)
        return self.config.get_data_unit_paths(data_units, **kwargs)

    def resolve_by_points(
        self, points: List[Union[Tuple[float, float], Point]], **kwargs
    ) -> List[Union[str, Path]]:
        """
        Resolve source paths for a list of points.
        Uses the config's get_relevant_data_units_by_points method.
        """
        if not self.config:
            raise ValueError("Config is required for resolving by points")
        data_units = self.config.get_relevant_data_units_by_points(points, **kwargs)
        return self.config.get_data_unit_paths(data_units, **kwargs)

    def resolve_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[Union[str, Path]]:
        """
        Resolve source paths for a geometry or GeoDataFrame.
        Uses the config's get_relevant_data_units_by_geometry method.
        """
        if not self.config:
            raise ValueError("Config is required for resolving by geometry")
        data_units = self.config.get_relevant_data_units_by_geometry(geometry, **kwargs)
        return self.config.get_data_unit_paths(data_units, **kwargs)

    def resolve_by_paths(
        self, paths: Union[Path, str, List[Union[str, Path]]], **kwargs
    ) -> List[Union[str, Path]]:
        """
        Return explicit paths as a list.
        """
        if isinstance(paths, (str, Path)):
            return [paths]
        return list(paths)

    def _pre_load_hook(self, source_data_path, **kwargs) -> Any:
        """Hook called before loading data."""
        if isinstance(source_data_path, (Path, str)):
            source_data_path = [source_data_path]

        if not source_data_path:
            self.logger.warning("No paths found!")
            return []

        source_data_paths = [str(file_path) for file_path in source_data_path]

        self.logger.info(
            f"Pre-loading validation complete for {len(source_data_path)} files"
        )
        return source_data_paths

    def _post_load_hook(self, data, **kwargs) -> Any:
        """Hook called after loading data."""
        if isinstance(data, Iterable):
            if len(data) == 0:
                self.logger.warning("No data was loaded from the source files")
                return data

            self.logger.info(f"{len(data)} valid data records.")

        self.logger.info(f"Post-load processing complete.")

        return data

    def _check_file_exists(self, file_paths: List[Union[str, Path]]):
        """
        Check that all specified files exist in the data store.

        Args:
            file_paths (List[Union[str, Path]]): List of file paths to check.

        Raises:
            RuntimeError: If any file does not exist in the data store.
        """
        for file_path in file_paths:
            if not self.data_store.file_exists(str(file_path)):
                raise RuntimeError(
                    f"Source file does not exist in the data store: {file_path}"
                )

    def _load_raster_data(
        self, raster_paths: List[Union[str, Path]]
    ) -> List[TifProcessor]:
        """
        Load raster data from file paths.

        Args:
            raster_paths (List[Union[str, Path]]): List of file paths to raster files.

        Returns:
            List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
        """
        return [
            TifProcessor(data_path, self.data_store, mode="single")
            for data_path in raster_paths
        ]

    def _load_tabular_data(
        self, file_paths: List[Union[str, Path]], read_function: Callable = read_dataset
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Load and concatenate tabular data from multiple files.

        Args:
            file_paths (List[Union[str, Path]]): List of file paths to load data from.
            read_function (Callable): Function to use for reading individual files.
                Defaults to read_dataset. Should accept (data_store, file_path) arguments.

        Returns:
            Union[pd.DataFrame, gpd.GeoDataFrame]: Concatenated data from all files.
                Returns empty DataFrame if no data is loaded.
        """
        all_data = []
        for file_path in file_paths:
            all_data.append(read_function(self.data_store, file_path))
        if not all_data:
            return pd.DataFrame()
        result = pd.concat(all_data, ignore_index=True)
        return result

    @abstractmethod
    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Any:
        """
        Abstract method to load source data from paths.

        Args:
            source_data_path: List of source paths
            **kwargs: Additional parameters for data loading

        Returns:
            Loaded data (DataFrame, GeoDataFrame, etc.)
        """
        pass

    def load(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,
            gpd.GeoDataFrame,
            Path,
            str,
            List[Union[str, Path]],
        ],
        **kwargs,
    ) -> Any:
        """
        Load data from the given source.

        Args:
            source: The data source (country code/name, points, geometry, paths, etc.).
            **kwargs: Additional parameters to pass to the loading process.

        Returns:
            The loaded data. The type depends on the subclass implementation.
        """
        source_data_paths = self.resolve_source_paths(source, **kwargs)
        if not source_data_paths:
            self.logger.warning(
                "No source data paths resolved. There's no matching data to load!"
            )
            return None
        processed_paths = self._pre_load_hook(source_data_paths, **kwargs)
        if not processed_paths:
            self.logger.warning("No valid paths to load data from.")
            return None

        loaded_data = self.load_from_paths(processed_paths, **kwargs)
        return self._post_load_hook(loaded_data, **kwargs)
load(source, **kwargs)

Load data from the given source.

Parameters:

    source (Union[str, List[Union[Tuple[float, float], Point]], BaseGeometry, GeoDataFrame, Path, str, List[Union[str, Path]]], required): The data source (country code/name, points, geometry, paths, etc.)
    **kwargs: Additional parameters to pass to the loading process

Returns:

    Any: The loaded data. The type depends on the subclass implementation.

Source code in gigaspatial/handlers/base.py
def load(
    self,
    source: Union[
        str,  # country
        List[Union[Tuple[float, float], Point]],  # points
        BaseGeometry,
        gpd.GeoDataFrame,
        Path,
        str,
        List[Union[str, Path]],
    ],
    **kwargs,
) -> Any:
    """
    Load data from the given source.

    Args:
        source: The data source (country code/name, points, geometry, paths, etc.).
        **kwargs: Additional parameters to pass to the loading process.

    Returns:
        The loaded data. The type depends on the subclass implementation.
    """
    source_data_paths = self.resolve_source_paths(source, **kwargs)
    if not source_data_paths:
        self.logger.warning(
            "No source data paths resolved. There's no matching data to load!"
        )
        return None
    processed_paths = self._pre_load_hook(source_data_paths, **kwargs)
    if not processed_paths:
        self.logger.warning("No valid paths to load data from.")
        return None

    loaded_data = self.load_from_paths(processed_paths, **kwargs)
    return self._post_load_hook(loaded_data, **kwargs)
load_from_paths(source_data_path, **kwargs) abstractmethod

Abstract method to load source data from paths.

Parameters:

    source_data_path (List[Union[str, Path]], required): List of source paths
    **kwargs: Additional parameters for data loading

Returns:

    Any: Loaded data (DataFrame, GeoDataFrame, etc.)

Source code in gigaspatial/handlers/base.py
@abstractmethod
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> Any:
    """
    Abstract method to load source data from paths.

    Args:
        source_data_path: List of source paths
        **kwargs: Additional parameters for data loading

    Returns:
        Loaded data (DataFrame, GeoDataFrame, etc.)
    """
    pass
resolve_by_country(country, **kwargs)

Resolve source paths for a given country code/name. Uses the config's get_relevant_data_units_by_country method.

Source code in gigaspatial/handlers/base.py
def resolve_by_country(self, country: str, **kwargs) -> List[Union[str, Path]]:
    """
    Resolve source paths for a given country code/name.
    Uses the config's get_relevant_data_units_by_country method.
    """
    if not self.config:
        raise ValueError("Config is required for resolving by country")
    data_units = self.config.get_relevant_data_units_by_country(country, **kwargs)
    return self.config.get_data_unit_paths(data_units, **kwargs)
resolve_by_geometry(geometry, **kwargs)

Resolve source paths for a geometry or GeoDataFrame. Uses the config's get_relevant_data_units_by_geometry method.

Source code in gigaspatial/handlers/base.py
def resolve_by_geometry(
    self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
) -> List[Union[str, Path]]:
    """
    Resolve source paths for a geometry or GeoDataFrame.
    Uses the config's get_relevant_data_units_by_geometry method.
    """
    if not self.config:
        raise ValueError("Config is required for resolving by geometry")
    data_units = self.config.get_relevant_data_units_by_geometry(geometry, **kwargs)
    return self.config.get_data_unit_paths(data_units, **kwargs)
resolve_by_paths(paths, **kwargs)

Return explicit paths as a list.

Source code in gigaspatial/handlers/base.py
def resolve_by_paths(
    self, paths: Union[Path, str, List[Union[str, Path]]], **kwargs
) -> List[Union[str, Path]]:
    """
    Return explicit paths as a list.
    """
    if isinstance(paths, (str, Path)):
        return [paths]
    return list(paths)
resolve_by_points(points, **kwargs)

Resolve source paths for a list of points. Uses the config's get_relevant_data_units_by_points method.

Source code in gigaspatial/handlers/base.py
def resolve_by_points(
    self, points: List[Union[Tuple[float, float], Point]], **kwargs
) -> List[Union[str, Path]]:
    """
    Resolve source paths for a list of points.
    Uses the config's get_relevant_data_units_by_points method.
    """
    if not self.config:
        raise ValueError("Config is required for resolving by points")
    data_units = self.config.get_relevant_data_units_by_points(points, **kwargs)
    return self.config.get_data_unit_paths(data_units, **kwargs)
resolve_source_paths(source, **kwargs)

Resolve source data paths based on the type of source input.

Parameters:

    source (Union[str, List[Union[Tuple[float, float], Point]], BaseGeometry, GeoDataFrame, Path, str, List[Union[str, Path]]], required): Can be a country code or name (str), list of points, geometry, GeoDataFrame, or explicit path(s)
    **kwargs: Additional parameters for path resolution

Returns:

    List[Union[str, Path]]: List of resolved source paths
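
The dispatch above accepts several source shapes. A hypothetical sketch of equivalent calls (country code, coordinates, and paths are placeholder values, and reader stands for any concrete reader with a config attached):

from shapely.geometry import Point, box

from gigaspatial.handlers.base import BaseHandlerReader


def show_source_forms(reader: BaseHandlerReader) -> None:
    """Hypothetical demo of the source shapes accepted by resolve_source_paths."""
    print(reader.resolve_source_paths("KEN"))                              # country code
    print(reader.resolve_source_paths([(36.8, -1.3), Point(37.0, -1.0)]))  # points
    print(reader.resolve_source_paths(box(36.0, -2.0, 38.0, 0.0)))         # geometry
    print(reader.resolve_source_paths(["tiles/a.parquet", "tiles/b.parquet"]))  # explicit paths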

Source code in gigaspatial/handlers/base.py
def resolve_source_paths(
    self,
    source: Union[
        str,  # country code
        List[Union[Tuple[float, float], Point]],  # points
        BaseGeometry,
        gpd.GeoDataFrame,
        Path,  # path
        str,  # path
        List[Union[str, Path]],
    ],
    **kwargs,
) -> List[Union[str, Path]]:
    """
    Resolve source data paths based on the type of source input.

    Args:
        source: Can be a country code or name (str), list of points, geometry, GeoDataFrame, or explicit path(s)
        **kwargs: Additional parameters for path resolution

    Returns:
        List of resolved source paths
    """
    if isinstance(source, (str, Path)):
        # Could be a country code or a path
        if self.data_store.file_exists(str(source)) or str(source).endswith(
            (".csv", ".tif", ".json", ".parquet", ".gz", ".geojson", ".zip")
        ):
            source_data_paths = self.resolve_by_paths(source)
        else:
            source_data_paths = self.resolve_by_country(source, **kwargs)
    elif isinstance(source, (BaseGeometry, gpd.GeoDataFrame)):
        source_data_paths = self.resolve_by_geometry(source, **kwargs)
    elif isinstance(source, Iterable):
        # List of points or paths
        if all(isinstance(p, (Iterable, Point)) for p in source):
            source_data_paths = self.resolve_by_points(source, **kwargs)
        elif all(isinstance(p, (str, Path)) for p in source):
            source_data_paths = self.resolve_by_paths(source)
        else:
            raise ValueError(
                "List input to resolve_source_paths must be all points or all paths."
            )
    else:
        raise NotImplementedError(f"Unsupported source type: {type(source)}")

    self.logger.info(f"Resolved {len(source_data_paths)} paths!")
    return source_data_paths

boundaries

AdminBoundaries

Bases: BaseModel

Base class for administrative boundary data with flexible fields.
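
A brief, hypothetical usage sketch based on the classmethods shown below; the country code and admin level are placeholder values.

from gigaspatial.handlers.boundaries import AdminBoundaries

# Load level-1 (state/province) boundaries for a country from GADM and
# convert them to a GeoDataFrame for further spatial work.
admin1 = AdminBoundaries.from_gadm(country_code="KEN", admin_level=1)
gdf = admin1.to_geodataframe()
print(gdf[["id", "name", "parent_id"]].head())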

Source code in gigaspatial/handlers/boundaries.py
class AdminBoundaries(BaseModel):
    """Base class for administrative boundary data with flexible fields."""

    boundaries: List[AdminBoundary] = Field(default_factory=list)
    level: int = Field(
        ...,
        ge=0,
        le=4,
        description="Administrative level (e.g., 0=country, 1=state, etc.)",
    )

    logger: ClassVar = global_config.get_logger("AdminBoundaries")

    _schema_config: ClassVar[Dict[str, Dict[str, str]]] = {
        "gadm": {
            "country_code": "GID_0",
            "id": "GID_{level}",
            "name": "NAME_{level}",
            "parent_id": "GID_{parent_level}",
        },
        "internal": {
            "id": "admin{level}_id_giga",
            "name": "name",
            "name_en": "name_en",
            "country_code": "iso_3166_1_alpha_3",
        },
        "geoBoundaries": {
            "id": "shapeID",
            "name": "shapeName",
            "country_code": "shapeGroup",
        },
    }

    def to_geodataframe(self) -> gpd.GeoDataFrame:
        """Convert the AdminBoundaries to a GeoDataFrame."""
        if not self.boundaries:
            if hasattr(self, "_empty_schema"):
                columns = self._empty_schema
            else:
                columns = ["id", "name", "country_code", "geometry"]
                if self.level > 0:
                    columns.append("parent_id")

            return gpd.GeoDataFrame(columns=columns, geometry="geometry", crs=4326)

        return gpd.GeoDataFrame(
            [boundary.model_dump() for boundary in self.boundaries],
            geometry="geometry",
            crs=4326,
        )

    @classmethod
    def get_schema_config(cls) -> Dict[str, Dict[str, str]]:
        """Return field mappings for different data sources"""
        return cls._schema_config

    @classmethod
    def from_gadm(
        cls, country_code: str, admin_level: int = 0, **kwargs
    ) -> "AdminBoundaries":
        """Load and create instance from GADM data."""
        url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{country_code}_{admin_level}.json"
        cls.logger.info(
            f"Loading GADM data for country: {country_code}, admin level: {admin_level} from URL: {url}"
        )
        try:
            gdf = gpd.read_file(url)

            gdf = cls._map_fields(gdf, "gadm", admin_level)

            if admin_level == 0:
                gdf["country_code"] = gdf["id"]
                gdf["name"] = gdf["COUNTRY"]
            elif admin_level == 1:
                gdf["country_code"] = gdf["parent_id"]

            boundaries = [
                AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
            ]
            cls.logger.info(f"Created {len(boundaries)} AdminBoundary objects.")
            return cls(
                boundaries=boundaries, level=admin_level, country_code=country_code
            )

        except (ValueError, HTTPError, FileNotFoundError) as e:
            cls.logger.warning(
                f"Error loading GADM data for {country_code} at admin level {admin_level}: {str(e)}"
            )
            cls.logger.info("Falling back to empty instance")
            return cls._create_empty_instance(country_code, admin_level, "gadm")

    @classmethod
    def from_data_store(
        cls,
        data_store: DataStore,
        path: Union[str, "Path"],
        admin_level: int = 0,
        **kwargs,
    ) -> "AdminBoundaries":
        """Load and create instance from internal data store."""
        cls.logger.info(
            f"Loading data from data store at path: {path}, admin level: {admin_level}"
        )
        try:
            gdf = read_dataset(data_store, str(path), **kwargs)

            if gdf.empty:
                cls.logger.warning(f"No data found at {path}.")
                return cls._create_empty_instance(None, admin_level, "internal")

            gdf = cls._map_fields(gdf, "internal", admin_level)

            if admin_level == 0:
                gdf["id"] = gdf["country_code"]
            else:
                gdf["parent_id"] = gdf["id"].apply(lambda x: x[:-3])

            boundaries = [
                AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
            ]
            cls.logger.info(f"Created {len(boundaries)} AdminBoundary objects.")
            return cls(boundaries=boundaries, level=admin_level)

        except (FileNotFoundError, KeyError) as e:
            cls.logger.warning(
                f"No data found at {path} for admin level {admin_level}: {str(e)}"
            )
            cls.logger.info("Falling back to empty instance")
            return cls._create_empty_instance(None, admin_level, "internal")

    @classmethod
    def from_georepo(
        cls,
        country_code: str = None,
        admin_level: int = 0,
        **kwargs,
    ) -> "AdminBoundaries":
        """
        Load and create instance from GeoRepo (UNICEF) API.

        Args:
            country_code: ISO3 country code used for the GeoRepo lookup
            admin_level: Administrative level (0=country, 1=state, etc.)
            kwargs: Extra arguments (ignored)

        Returns:
            AdminBoundaries instance
        """
        cls.logger.info(
            f"Loading data from UNICEF GeoRepo for country: {country_code}, admin level: {admin_level}"
        )
        from gigaspatial.handlers.unicef_georepo import get_country_boundaries_by_iso3

        # Fetch boundaries from GeoRepo
        geojson = get_country_boundaries_by_iso3(country_code, admin_level=admin_level)

        features = geojson.get("features", [])
        boundaries = []
        parent_level = admin_level - 1

        for feat in features:
            props = feat.get("properties", {})
            geometry = feat.get("geometry")
            shapely_geom = shape(geometry) if geometry else None
            # For admin_level 0, no parent_id
            parent_id = None
            if admin_level > 0:
                parent_id = props.get(f"adm{parent_level}_ucode")

            boundary = AdminBoundary(
                id=props.get("ucode"),
                name=props.get("name"),
                name_en=props.get("name_en"),
                geometry=shapely_geom,
                parent_id=parent_id,
                country_code=country_code,
            )
            boundaries.append(boundary)

        cls.logger.info(
            f"Created {len(boundaries)} AdminBoundary objects from GeoRepo data."
        )

        # Try to infer country_code from first boundary if not set
        if boundaries and not boundaries[0].country_code:
            boundaries[0].country_code = boundaries[0].id[:3]

        return cls(boundaries=boundaries, level=admin_level)

    @classmethod
    def from_geoboundaries(cls, country_code, admin_level: int = 0):
        cls.logger.info(
            f"Searching for geoBoundaries data for country: {country_code}, admin level: {admin_level}"
        )

        country_datasets = HDXConfig.search_datasets(
            query=f'dataseries_name:"geoBoundaries - Subnational Administrative Boundaries" AND groups:"{country_code.lower()}"',
            rows=1,
        )
        if not country_datasets:
            cls.logger.error(f"No datasets found for country: {country_code}")
            raise ValueError(
                "No resources found for the specified country. Please check your search parameters and try again."
            )

        cls.logger.info(f"Found dataset: {country_datasets[0].get('title', 'Unknown')}")

        resources = [
            resource
            for resource in country_datasets[0].get_resources()
            if (
                resource.data["name"]
                == f"geoBoundaries-{country_code.upper()}-ADM{admin_level}.geojson"
            )
        ]

        if not resources:
            cls.logger.error(
                f"No resources found for {country_code} at admin level {admin_level}"
            )
            raise ValueError(
                "No resources found for the specified criteria. Please check your search parameters and try again."
            )

        cls.logger.info(f"Found resource: {resources[0].data.get('name', 'Unknown')}")

        try:
            cls.logger.info("Downloading and processing boundary data...")
            with tempfile.TemporaryDirectory() as tmpdir:
                url, local_path = resources[0].download(folder=tmpdir)
                cls.logger.debug(f"Downloaded file to temporary path: {local_path}")
                with open(local_path, "rb") as f:
                    gdf = gpd.read_file(f)

            gdf = cls._map_fields(gdf, "geoBoundaries", admin_level)
            boundaries = [
                AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
            ]
            cls.logger.info(
                f"Successfully created {len(boundaries)} AdminBoundary objects"
            )
            return cls(boundaries=boundaries, level=admin_level)

        except (ValueError, HTTPError, FileNotFoundError) as e:
            cls.logger.warning(
                f"Error loading geoBoundaries data for {country_code} at admin level {admin_level}: {str(e)}"
            )
            cls.logger.info("Falling back to empty instance")
            return cls._create_empty_instance(
                country_code, admin_level, "geoBoundaries"
            )

    @classmethod
    def create(
        cls,
        country_code: Optional[str] = None,
        admin_level: int = 0,
        data_store: Optional[DataStore] = None,
        path: Optional[Union[str, "Path"]] = None,
        **kwargs,
    ) -> "AdminBoundaries":
        """
        Factory method to create an AdminBoundaries instance using various data sources,
        depending on the provided parameters and global configuration.

        Loading Logic:
            1. If a `data_store` is provided and either a `path` is given or
               `global_config.ADMIN_BOUNDARIES_DATA_DIR` is set:
                - If `path` is not provided but `country_code` is, the path is constructed
                  using `global_config.get_admin_path()`.
                - Loads boundaries from the specified data store and path.

            2. If only `country_code` is provided (no data_store):
                - Attempts to load boundaries from GeoRepo (if available).
                - If GeoRepo is unavailable, attempts to load from GADM.
                - If GADM fails, falls back to geoBoundaries.
                - Raises an error if all sources fail.

            3. If neither `country_code` nor `data_store` is provided:
                - Raises a ValueError.

        Args:
            country_code (Optional[str]): ISO country code (2 or 3 letter) or country name.
            admin_level (int): Administrative level (0=country, 1=state/province, etc.).
            data_store (Optional[DataStore]): Optional data store instance for loading from existing data.
            path (Optional[Union[str, Path]]): Optional path to data file (used with data_store).
            **kwargs: Additional arguments passed to the underlying creation methods.

        Returns:
            AdminBoundaries: Configured instance.

        Raises:
            ValueError: If neither country_code nor (data_store, path) are provided,
                        or if country_code lookup fails.
            RuntimeError: If all data sources fail to load boundaries.

        Examples:
            # Load from a data store (path auto-generated if not provided)
            boundaries = AdminBoundaries.create(country_code="USA", admin_level=1, data_store=store)

            # Load from a specific file in a data store
            boundaries = AdminBoundaries.create(data_store=store, path="data.shp")

            # Load from online sources (GeoRepo, GADM, geoBoundaries)
            boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)
        """
        cls.logger.info(
            f"Creating AdminBoundaries instance. Country: {country_code}, "
            f"admin level: {admin_level}, data_store provided: {data_store is not None}, "
            f"path provided: {path is not None}"
        )

        from_data_store = data_store is not None and (
            global_config.ADMIN_BOUNDARIES_DATA_DIR is not None or path is not None
        )

        # Validate input parameters
        if not country_code and not data_store:
            raise ValueError("Either country_code or data_store must be provided.")

        if from_data_store and not path and not country_code:
            raise ValueError(
                "If data_store is provided, either path or country_code must also be specified."
            )

        # Handle data store path first
        if from_data_store:
            iso3_code = None
            if country_code:
                try:
                    iso3_code = pycountry.countries.lookup(country_code).alpha_3
                except LookupError as e:
                    raise ValueError(f"Invalid country code '{country_code}': {e}")

            # Generate path if not provided
            if path is None and iso3_code:
                path = global_config.get_admin_path(
                    country_code=iso3_code,
                    admin_level=admin_level,
                )

            return cls.from_data_store(data_store, path, admin_level, **kwargs)

        # Handle country code path
        if country_code is not None:
            try:
                iso3_code = pycountry.countries.lookup(country_code).alpha_3
            except LookupError as e:
                raise ValueError(f"Invalid country code '{country_code}': {e}")

            # Try GeoRepo first
            if cls._try_georepo(iso3_code, admin_level):
                return cls.from_georepo(iso3_code, admin_level=admin_level)

            # Fallback to GADM
            try:
                cls.logger.info("Attempting to load from GADM.")
                return cls.from_gadm(iso3_code, admin_level, **kwargs)
            except Exception as e:
                cls.logger.warning(
                    f"GADM loading failed: {e}. Falling back to geoBoundaries."
                )

            # Final fallback to geoBoundaries
            try:
                return cls.from_geoboundaries(iso3_code, admin_level)
            except Exception as e:
                cls.logger.error(f"All data sources failed. geoBoundaries error: {e}")
                raise RuntimeError(
                    f"Failed to load administrative boundaries for {country_code} "
                    f"from all available sources (GeoRepo, GADM, geoBoundaries)."
                ) from e

        # This should never be reached due to validation above
        raise ValueError("Unexpected error: no valid data source could be determined.")

    @classmethod
    def _try_georepo(cls, iso3_code: str, admin_level: int) -> bool:
        """Helper method to test GeoRepo availability.

        Args:
            iso3_code: ISO3 country code
            admin_level: Administrative level

        Returns:
            bool: True if GeoRepo is available and working, False otherwise
        """
        try:
            from gigaspatial.handlers.unicef_georepo import GeoRepoClient

            client = GeoRepoClient()
            if client.check_connection():
                cls.logger.info("GeoRepo connection successful.")
                return True
            else:
                cls.logger.info("GeoRepo connection failed.")
                return False

        except ImportError:
            cls.logger.info("GeoRepo client not available (import failed).")
            return False
        except ValueError as e:
            cls.logger.warning(f"GeoRepo initialization failed: {e}")
            return False
        except Exception as e:
            cls.logger.warning(f"GeoRepo error: {e}")
            return False

    @classmethod
    def _create_empty_instance(
        cls, country_code: Optional[str], admin_level: int, source_type: str
    ) -> "AdminBoundaries":
        """Create an empty instance with the required schema structure."""
        # for to_geodataframe() to use later
        instance = cls(boundaries=[], level=admin_level, country_code=country_code)

        schema_fields = set(cls.get_schema_config()[source_type].keys())
        schema_fields.update(["geometry", "country_code", "id", "name", "name_en"])
        if admin_level > 0:
            schema_fields.add("parent_id")

        instance._empty_schema = list(schema_fields)
        return instance

    @classmethod
    def _map_fields(
        cls,
        gdf: gpd.GeoDataFrame,
        source: str,
        current_level: int,
    ) -> gpd.GeoDataFrame:
        """Map source fields to schema fields"""
        config = cls.get_schema_config().get(source, {})
        parent_level = current_level - 1

        field_mapping = {}
        for k, v in config.items():
            if "{parent_level}" in v:
                field_mapping[v.format(parent_level=parent_level)] = k
            elif "{level}" in v:
                field_mapping[v.format(level=current_level)] = k
            else:
                field_mapping[v] = k

        return gdf.rename(columns=field_mapping)
create(country_code=None, admin_level=0, data_store=None, path=None, **kwargs) classmethod

Factory method to create an AdminBoundaries instance using various data sources, depending on the provided parameters and global configuration.

Loading Logic
  1. If a data_store is provided and either a path is given or global_config.ADMIN_BOUNDARIES_DATA_DIR is set:
     • If path is not provided but country_code is, the path is constructed using global_config.get_admin_path().
     • Loads boundaries from the specified data store and path.
  2. If only country_code is provided (no data_store):
     • Attempts to load boundaries from GeoRepo (if available).
     • If GeoRepo is unavailable, attempts to load from GADM.
     • If GADM fails, falls back to geoBoundaries.
     • Raises an error if all sources fail.
  3. If neither country_code nor data_store is provided:
     • Raises a ValueError.

Parameters:

    country_code (Optional[str], default: None)
        ISO country code (2 or 3 letter) or country name.
    admin_level (int, default: 0)
        Administrative level (0=country, 1=state/province, etc.).
    data_store (Optional[DataStore], default: None)
        Optional data store instance for loading from existing data.
    path (Optional[Union[str, Path]], default: None)
        Optional path to data file (used with data_store).
    **kwargs (default: {})
        Additional arguments passed to the underlying creation methods.

Returns:

    AdminBoundaries (AdminBoundaries)
        Configured instance.

Raises:

    ValueError
        If neither country_code nor (data_store, path) are provided, or if country_code lookup fails.
    RuntimeError
        If all data sources fail to load boundaries.

Examples:

Load from a data store (path auto-generated if not provided)

boundaries = AdminBoundaries.create(country_code="USA", admin_level=1, data_store=store)

Load from a specific file in a data store

boundaries = AdminBoundaries.create(data_store=store, path="data.shp")

Load from online sources (GeoRepo, GADM, geoBoundaries)

boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def create(
    cls,
    country_code: Optional[str] = None,
    admin_level: int = 0,
    data_store: Optional[DataStore] = None,
    path: Optional[Union[str, "Path"]] = None,
    **kwargs,
) -> "AdminBoundaries":
    """
    Factory method to create an AdminBoundaries instance using various data sources,
    depending on the provided parameters and global configuration.

    Loading Logic:
        1. If a `data_store` is provided and either a `path` is given or
           `global_config.ADMIN_BOUNDARIES_DATA_DIR` is set:
            - If `path` is not provided but `country_code` is, the path is constructed
              using `global_config.get_admin_path()`.
            - Loads boundaries from the specified data store and path.

        2. If only `country_code` is provided (no data_store):
            - Attempts to load boundaries from GeoRepo (if available).
            - If GeoRepo is unavailable, attempts to load from GADM.
            - If GADM fails, falls back to geoBoundaries.
            - Raises an error if all sources fail.

        3. If neither `country_code` nor `data_store` is provided:
            - Raises a ValueError.

    Args:
        country_code (Optional[str]): ISO country code (2 or 3 letter) or country name.
        admin_level (int): Administrative level (0=country, 1=state/province, etc.).
        data_store (Optional[DataStore]): Optional data store instance for loading from existing data.
        path (Optional[Union[str, Path]]): Optional path to data file (used with data_store).
        **kwargs: Additional arguments passed to the underlying creation methods.

    Returns:
        AdminBoundaries: Configured instance.

    Raises:
        ValueError: If neither country_code nor (data_store, path) are provided,
                    or if country_code lookup fails.
        RuntimeError: If all data sources fail to load boundaries.

    Examples:
        # Load from a data store (path auto-generated if not provided)
        boundaries = AdminBoundaries.create(country_code="USA", admin_level=1, data_store=store)

        # Load from a specific file in a data store
        boundaries = AdminBoundaries.create(data_store=store, path="data.shp")

        # Load from online sources (GeoRepo, GADM, geoBoundaries)
        boundaries = AdminBoundaries.create(country_code="USA", admin_level=1)
    """
    cls.logger.info(
        f"Creating AdminBoundaries instance. Country: {country_code}, "
        f"admin level: {admin_level}, data_store provided: {data_store is not None}, "
        f"path provided: {path is not None}"
    )

    from_data_store = data_store is not None and (
        global_config.ADMIN_BOUNDARIES_DATA_DIR is not None or path is not None
    )

    # Validate input parameters
    if not country_code and not data_store:
        raise ValueError("Either country_code or data_store must be provided.")

    if from_data_store and not path and not country_code:
        raise ValueError(
            "If data_store is provided, either path or country_code must also be specified."
        )

    # Handle data store path first
    if from_data_store:
        iso3_code = None
        if country_code:
            try:
                iso3_code = pycountry.countries.lookup(country_code).alpha_3
            except LookupError as e:
                raise ValueError(f"Invalid country code '{country_code}': {e}")

        # Generate path if not provided
        if path is None and iso3_code:
            path = global_config.get_admin_path(
                country_code=iso3_code,
                admin_level=admin_level,
            )

        return cls.from_data_store(data_store, path, admin_level, **kwargs)

    # Handle country code path
    if country_code is not None:
        try:
            iso3_code = pycountry.countries.lookup(country_code).alpha_3
        except LookupError as e:
            raise ValueError(f"Invalid country code '{country_code}': {e}")

        # Try GeoRepo first
        if cls._try_georepo(iso3_code, admin_level):
            return cls.from_georepo(iso3_code, admin_level=admin_level)

        # Fallback to GADM
        try:
            cls.logger.info("Attempting to load from GADM.")
            return cls.from_gadm(iso3_code, admin_level, **kwargs)
        except Exception as e:
            cls.logger.warning(
                f"GADM loading failed: {e}. Falling back to geoBoundaries."
            )

        # Final fallback to geoBoundaries
        try:
            return cls.from_geoboundaries(iso3_code, admin_level)
        except Exception as e:
            cls.logger.error(f"All data sources failed. geoBoundaries error: {e}")
            raise RuntimeError(
                f"Failed to load administrative boundaries for {country_code} "
                f"from all available sources (GeoRepo, GADM, geoBoundaries)."
            ) from e

    # This should never be reached due to validation above
    raise ValueError("Unexpected error: no valid data source could be determined.")
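
Example (illustrative): a minimal sketch that chains create() with to_geodataframe(). It assumes network access to at least one of the online sources (GeoRepo, GADM, geoBoundaries); "KEN" is only an example ISO3 code.

# A minimal usage sketch; network access and the "KEN" code are assumptions.
from gigaspatial.handlers.boundaries import AdminBoundaries

admin1 = AdminBoundaries.create(country_code="KEN", admin_level=1)
gdf = admin1.to_geodataframe()  # one row per AdminBoundary, CRS EPSG:4326
print(len(admin1.boundaries), gdf.crs)
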
from_data_store(data_store, path, admin_level=0, **kwargs) classmethod

Load and create instance from internal data store.

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def from_data_store(
    cls,
    data_store: DataStore,
    path: Union[str, "Path"],
    admin_level: int = 0,
    **kwargs,
) -> "AdminBoundaries":
    """Load and create instance from internal data store."""
    cls.logger.info(
        f"Loading data from data store at path: {path}, admin level: {admin_level}"
    )
    try:
        gdf = read_dataset(data_store, str(path), **kwargs)

        if gdf.empty:
            cls.logger.warning(f"No data found at {path}.")
            return cls._create_empty_instance(None, admin_level, "internal")

        gdf = cls._map_fields(gdf, "internal", admin_level)

        if admin_level == 0:
            gdf["id"] = gdf["country_code"]
        else:
            gdf["parent_id"] = gdf["id"].apply(lambda x: x[:-3])

        boundaries = [
            AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
        ]
        cls.logger.info(f"Created {len(boundaries)} AdminBoundary objects.")
        return cls(boundaries=boundaries, level=admin_level)

    except (FileNotFoundError, KeyError) as e:
        cls.logger.warning(
            f"No data found at {path} for admin level {admin_level}: {str(e)}"
        )
        cls.logger.info("Falling back to empty instance")
        return cls._create_empty_instance(None, admin_level, "internal")
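
Example (illustrative): a minimal sketch that reads boundaries back from a local data store. The LocalDataStore import path and the GeoJSON path are assumptions for illustration only.

# Assumed import path for LocalDataStore; adjust to your installation if it differs.
from gigaspatial.core.io import LocalDataStore
from gigaspatial.handlers.boundaries import AdminBoundaries

store = LocalDataStore()
admin2 = AdminBoundaries.from_data_store(
    data_store=store,
    path="admin_boundaries/KEN_admin2.geojson",  # hypothetical path inside the store
    admin_level=2,
)
print(len(admin2.boundaries))
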
from_gadm(country_code, admin_level=0, **kwargs) classmethod

Load and create instance from GADM data.

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def from_gadm(
    cls, country_code: str, admin_level: int = 0, **kwargs
) -> "AdminBoundaries":
    """Load and create instance from GADM data."""
    url = f"https://geodata.ucdavis.edu/gadm/gadm4.1/json/gadm41_{country_code}_{admin_level}.json"
    cls.logger.info(
        f"Loading GADM data for country: {country_code}, admin level: {admin_level} from URL: {url}"
    )
    try:
        gdf = gpd.read_file(url)

        gdf = cls._map_fields(gdf, "gadm", admin_level)

        if admin_level == 0:
            gdf["country_code"] = gdf["id"]
            gdf["name"] = gdf["COUNTRY"]
        elif admin_level == 1:
            gdf["country_code"] = gdf["parent_id"]

        boundaries = [
            AdminBoundary(**row_dict) for row_dict in gdf.to_dict("records")
        ]
        cls.logger.info(f"Created {len(boundaries)} AdminBoundary objects.")
        return cls(
            boundaries=boundaries, level=admin_level, country_code=country_code
        )

    except (ValueError, HTTPError, FileNotFoundError) as e:
        cls.logger.warning(
            f"Error loading GADM data for {country_code} at admin level {admin_level}: {str(e)}"
        )
        cls.logger.info("Falling back to empty instance")
        return cls._create_empty_instance(country_code, admin_level, "gadm")
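
Example (illustrative): a minimal sketch that pulls country-level boundaries directly from the GADM 4.1 server; outbound network access is required and "SEN" is only an example ISO3 code.

# On download or parsing errors the method falls back to an empty instance.
from gigaspatial.handlers.boundaries import AdminBoundaries

senegal = AdminBoundaries.from_gadm("SEN", admin_level=0)
print(len(senegal.boundaries))  # 0 if the GADM lookup failed
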
from_georepo(country_code=None, admin_level=0, **kwargs) classmethod

Load and create instance from GeoRepo (UNICEF) API.

Parameters:

    country_code (str, default: None)
        ISO3 country code used for the GeoRepo lookup.
    admin_level (int, default: 0)
        Administrative level (0=country, 1=state, etc.).
    **kwargs (default: {})
        Extra arguments (ignored).

Returns:

    AdminBoundaries
        AdminBoundaries instance.

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def from_georepo(
    cls,
    country_code: str = None,
    admin_level: int = 0,
    **kwargs,
) -> "AdminBoundaries":
    """
    Load and create instance from GeoRepo (UNICEF) API.

    Args:
        country_code: ISO3 country code used for the GeoRepo lookup
        admin_level: Administrative level (0=country, 1=state, etc.)
        kwargs: Extra arguments (ignored)

    Returns:
        AdminBoundaries instance
    """
    cls.logger.info(
        f"Loading data from UNICEF GeoRepo for country: {country_code}, admin level: {admin_level}"
    )
    from gigaspatial.handlers.unicef_georepo import get_country_boundaries_by_iso3

    # Fetch boundaries from GeoRepo
    geojson = get_country_boundaries_by_iso3(country_code, admin_level=admin_level)

    features = geojson.get("features", [])
    boundaries = []
    parent_level = admin_level - 1

    for feat in features:
        props = feat.get("properties", {})
        geometry = feat.get("geometry")
        shapely_geom = shape(geometry) if geometry else None
        # For admin_level 0, no parent_id
        parent_id = None
        if admin_level > 0:
            parent_id = props.get(f"adm{parent_level}_ucode")

        boundary = AdminBoundary(
            id=props.get("ucode"),
            name=props.get("name"),
            name_en=props.get("name_en"),
            geometry=shapely_geom,
            parent_id=parent_id,
            country_code=country_code,
        )
        boundaries.append(boundary)

    cls.logger.info(
        f"Created {len(boundaries)} AdminBoundary objects from GeoRepo data."
    )

    # Try to infer country_code from first boundary if not set
    if boundaries and not boundaries[0].country_code:
        boundaries[0].country_code = boundaries[0].id[:3]

    return cls(boundaries=boundaries, level=admin_level)
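
Example (illustrative): a minimal sketch, assuming valid GeoRepo credentials are already configured for GeoRepoClient (for instance via environment variables; the exact mechanism is outside this snippet). "UGA" is only an example ISO3 code.

from gigaspatial.handlers.boundaries import AdminBoundaries

districts = AdminBoundaries.from_georepo("UGA", admin_level=1)
print(f"Fetched {len(districts.boundaries)} admin-1 units from GeoRepo")
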
get_schema_config() classmethod

Return field mappings for different data sources

Source code in gigaspatial/handlers/boundaries.py
@classmethod
def get_schema_config(cls) -> Dict[str, Dict[str, str]]:
    """Return field mappings for different data sources"""
    return cls._schema_config
to_geodataframe()

Convert the AdminBoundaries to a GeoDataFrame.

Source code in gigaspatial/handlers/boundaries.py
def to_geodataframe(self) -> gpd.GeoDataFrame:
    """Convert the AdminBoundaries to a GeoDataFrame."""
    if not self.boundaries:
        if hasattr(self, "_empty_schema"):
            columns = self._empty_schema
        else:
            columns = ["id", "name", "country_code", "geometry"]
            if self.level > 0:
                columns.append("parent_id")

        return gpd.GeoDataFrame(columns=columns, geometry="geometry", crs=4326)

    return gpd.GeoDataFrame(
        [boundary.model_dump() for boundary in self.boundaries],
        geometry="geometry",
        crs=4326,
    )

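Example (illustrative): even when no boundaries were loaded, to_geodataframe() returns an empty frame with the expected columns and CRS, which keeps downstream joins and concatenations simple.

from gigaspatial.handlers.boundaries import AdminBoundaries

empty = AdminBoundaries(boundaries=[], level=1)
gdf = empty.to_geodataframe()
print(list(gdf.columns))  # ['id', 'name', 'country_code', 'geometry', 'parent_id']
print(gdf.crs)            # EPSG:4326
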
AdminBoundary

Bases: BaseModel

Base class for administrative boundary data with flexible fields.

Source code in gigaspatial/handlers/boundaries.py
class AdminBoundary(BaseModel):
    """Base class for administrative boundary data with flexible fields."""

    id: str = Field(..., description="Unique identifier for the administrative unit")
    name: str = Field(..., description="Primary local name")
    geometry: Union[Polygon, MultiPolygon] = Field(
        ..., description="Geometry of the administrative boundary"
    )

    name_en: Optional[str] = Field(
        None, description="English name if different from local name"
    )
    parent_id: Optional[str] = Field(
        None, description="ID of parent administrative unit"
    )
    country_code: Optional[str] = Field(
        None, min_length=3, max_length=3, description="ISO 3166-1 alpha-3 country code"
    )

    class Config:
        arbitrary_types_allowed = True

ghsl

CoordSystem

Bases: int, Enum

Enum for coordinate systems used by GHSL datasets.

Source code in gigaspatial/handlers/ghsl.py
class CoordSystem(int, Enum):
    """Enum for coordinate systems used by GHSL datasets."""

    WGS84 = 4326
    Mollweide = 54009

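Example (illustrative): the enum values are the numeric EPSG/ESRI codes, which is what GHSLDataConfig uses to build tile URLs and its crs property.

from gigaspatial.handlers.ghsl import CoordSystem

print(CoordSystem.WGS84.value)      # 4326  -> "EPSG:4326"
print(CoordSystem.Mollweide.value)  # 54009 -> "ESRI:54009"
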
GHSLDataConfig dataclass

Bases: BaseHandlerConfig

Source code in gigaspatial/handlers/ghsl.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GHSLDataConfig(BaseHandlerConfig):
    # constants
    AVAILABLE_YEARS: List = Field(default=np.append(np.arange(1975, 2031, 5), 2018))
    AVAILABLE_RESOLUTIONS: List = Field(default=[10, 100, 1000])

    # base config
    GHSL_DB_BASE_URL: HttpUrl = Field(
        default="https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/"
    )
    TILES_URL: str = "https://ghsl.jrc.ec.europa.eu/download/GHSL_data_{}_shapefile.zip"

    # user config
    base_path: Path = Field(default=global_config.get_path("ghsl", "bronze"))
    coord_system: CoordSystem = CoordSystem.WGS84
    release: str = "R2023A"

    product: Literal[
        "GHS_BUILT_S",
        "GHS_BUILT_H_AGBH",
        "GHS_BUILT_H_ANBH",
        "GHS_BUILT_V",
        "GHS_POP",
        "GHS_SMOD",
    ] = Field(...)
    year: int = 2020
    resolution: int = 100

    def __post_init__(self):
        super().__post_init__()

    def _load_tiles(self):
        """Load GHSL tiles from tiles shapefile."""
        try:
            self.tiles_gdf = gpd.read_file(self.TILES_URL)
        except Exception as e:
            self.logger.error(f"Failed to download tiles shapefile: {e}")
            raise ValueError(
                f"Could not download GHSL tiles from {self.TILES_URL}"
            ) from e

    @field_validator("year")
    def validate_year(cls, value: int) -> int:
        if value in cls.AVAILABLE_YEARS:
            return value
        raise ValueError(
            f"No datasets found for the provided year: {value}\nAvailable years are: {cls.AVAILABLE_YEARS}"
        )

    @field_validator("resolution")
    def validate_resolution(cls, value: int) -> int:
        if value in cls.AVAILABLE_RESOLUTIONS:
            return value
        raise ValueError(
            f"No datasets found for the provided resolution: {value}\nAvailable resolutions are: {cls.AVAILABLE_RESOLUTIONS}"
        )

    @model_validator(mode="after")
    def validate_configuration(self):
        """
        Validate that the configuration is valid based on dataset availability constraints.

        Specific rules:
        - GHS_BUILT_V, GHS_POP and GHS_SMOD products are not available for 2018.
        - Building height (GHS_BUILT_H_*) products are only available for 2018.
        - Built-up surface (GHS_BUILT_S) for 2018 is only available at 10m resolution in the Mollweide projection.
        - GHS_SMOD is only available at 1000m resolution in the Mollweide projection.
        - Incompatible year/resolution/coordinate system combinations are either rejected or coerced to a valid configuration with a warning.
        """
        if self.year == 2018 and self.product in ["GHS_BUILT_V", "GHS_POP", "GHS_SMOD"]:
            raise ValueError(f"{self.product} product is not available for 2018")

        if self.resolution == 10 and "GHS_BUILT_H" not in self.product:
            raise ValueError(
                f"{self.product} product is not available at 10 (10m) resolution"
            )

        if "GHS_BUILT_H" in self.product:
            if self.year != 2018:
                self.logger.warning(
                    "Building height product is only available for 2018, year is set as 2018"
                )
                self.year = 2018

        if self.product == "GHS_BUILT_S":
            if self.year == 2018 and self.resolution != 10:
                self.logger.warning(
                    "Built-up surface product for 2018 is only available at 10m resolution, resolution is set as 10m"
                )
                self.resolution = 10

            if self.resolution == 10 and self.year != 2018:
                self.logger.warning(
                    "Built-up surface product at resolution 10 is only available for 2018, year is set as 2018"
                )
                self.year = 2018

            if self.resolution == 10 and self.coord_system != CoordSystem.Mollweide:
                self.logger.warning(
                    f"Built-up surface product at resolution 10 is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
                )
                self.coord_system = CoordSystem.Mollweide

        if self.product == "GHS_SMOD":
            if self.resolution != 1000:
                self.logger.warning(
                    f"Settlement model (SMOD) product is only available at 1000 (1km) resolution, resolution is set as 1000"
                )
                self.resolution = 1000

            if self.coord_system != CoordSystem.Mollweide:
                self.logger.warning(
                    f"Settlement model (SMOD) product is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
                )
                self.coord_system = CoordSystem.Mollweide

        self.TILES_URL = self.TILES_URL.format(self.coord_system.value)
        self._load_tiles()

        return self

    @property
    def crs(self) -> str:
        return "EPSG:4326" if self.coord_system == CoordSystem.WGS84 else "ESRI:54009"

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[dict]:
        """
        Return intersecting tiles for a given geometry or GeoDataFrame.
        """
        return self._get_relevant_tiles(geometry)

    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> List[dict]:
        """
        Return intersecting tiles for a list of points.
        """
        return self._get_relevant_tiles(points)

    def get_data_unit_path(self, unit: str = None, file_ext=".zip", **kwargs) -> Path:
        """Construct and return the path for the configured dataset or dataset tile."""
        info = self._get_product_info()

        tile_path = (
            self.base_path
            / info["product_folder"]
            / (
                f"{info['product_name']}_V{info['product_version']}_0"
                + (f"_{unit}" if unit else "")
                + file_ext
            )
        )

        return tile_path

    def compute_dataset_url(self, tile_id=None) -> str:
        """Compute the download URL for a GHSL dataset."""
        info = self._get_product_info()

        path_segments = [
            str(self.GHSL_DB_BASE_URL),
            info["product_folder"],
            info["product_name"],
            f"V{info['product_version']}-0",
            "tiles" if tile_id else "",
            f"{info['product_name']}_V{info['product_version']}_0"
            + (f"_{tile_id}" if tile_id else "")
            + ".zip",
        ]

        return "/".join(path_segments)

    def _get_relevant_tiles(
        self,
        source: Union[
            BaseGeometry,
            gpd.GeoDataFrame,
            Iterable[Union[Point, tuple]],
        ],
        crs="EPSG:4326",
    ) -> list:
        """
        Identify and return the GHSL tiles that spatially intersect with the given geometry.

        The input geometry can be a Shapely geometry object, a GeoDataFrame,
        or a list of Point objects or (lat, lon) tuples. The method ensures
        the input geometry is in GHSL tiles projection for the spatial intersection.

        Args:
            source: A Shapely geometry, a GeoDataFrame, or a list of Point
                      objects or (lat, lon) tuples representing the area of interest.

        Returns:
            A list of the tile ids for the intersecting tiles.

        Raises:
            ValueError: If the input `source` is not one of the supported types.
        """
        if isinstance(source, gpd.GeoDataFrame):
            if source.crs != crs:
                source = source.to_crs(crs)
            search_geom = source.geometry.unary_union
        elif isinstance(
            source,
            BaseGeometry,
        ):
            search_geom = source
        elif isinstance(source, Iterable) and all(
            len(pt) == 2 or isinstance(pt, Point) for pt in source
        ):
            points = [
                pt if isinstance(pt, Point) else Point(pt[1], pt[0]) for pt in source
            ]
            search_geom = MultiPoint(points)
        else:
            raise ValueError(
                f"Expected Geometry, GeoDataFrame or iterable object of Points got {source.__class__}"
            )

        if self.tiles_gdf.crs != crs:
            search_geom = (
                gpd.GeoDataFrame(geometry=[search_geom], crs=crs)
                .to_crs(self.tiles_gdf.crs)
                .geometry[0]
            )

        # Find intersecting tiles
        mask = (
            tile_geom.intersects(search_geom) for tile_geom in self.tiles_gdf.geometry
        )

        intersecting_tiles = self.tiles_gdf.loc[mask, "tile_id"].to_list()

        return intersecting_tiles

    def _get_product_info(self) -> dict:
        """Generate and return common product information used in multiple methods."""
        resolution_str = (
            str(self.resolution)
            if self.coord_system == CoordSystem.Mollweide
            else ("3ss" if self.resolution == 100 else "30ss")
        )
        product_folder = f"{self.product}_GLOBE_{self.release}"
        product_name = f"{self.product}_E{self.year}_GLOBE_{self.release}_{self.coord_system.value}_{resolution_str}"
        product_version = 2 if self.product == "GHS_SMOD" else 1

        return {
            "resolution_str": resolution_str,
            "product_folder": product_folder,
            "product_name": product_name,
            "product_version": product_version,
        }

    def __repr__(self) -> str:
        """Return a string representation of the GHSL dataset configuration."""
        return (
            f"GHSLDataConfig("
            f"product='{self.product}', "
            f"year={self.year}, "
            f"resolution={self.resolution}, "
            f"coord_system={self.coord_system.name}, "
            f"release='{self.release}'"
            f")"
        )
__repr__()

Return a string representation of the GHSL dataset configuration.

Source code in gigaspatial/handlers/ghsl.py
def __repr__(self) -> str:
    """Return a string representation of the GHSL dataset configuration."""
    return (
        f"GHSLDataConfig("
        f"product='{self.product}', "
        f"year={self.year}, "
        f"resolution={self.resolution}, "
        f"coord_system={self.coord_system.name}, "
        f"release='{self.release}'"
        f")"
    )
compute_dataset_url(tile_id=None)

Compute the download URL for a GHSL dataset.

Source code in gigaspatial/handlers/ghsl.py
def compute_dataset_url(self, tile_id=None) -> str:
    """Compute the download URL for a GHSL dataset."""
    info = self._get_product_info()

    path_segments = [
        str(self.GHSL_DB_BASE_URL),
        info["product_folder"],
        info["product_name"],
        f"V{info['product_version']}-0",
        "tiles" if tile_id else "",
        f"{info['product_name']}_V{info['product_version']}_0"
        + (f"_{tile_id}" if tile_id else "")
        + ".zip",
    ]

    return "/".join(path_segments)
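
Example (illustrative): a minimal sketch of the URLs a config produces. Constructing GHSLDataConfig downloads the GHSL tile index shapefile, so network access is assumed; the tile id "R7_C22" is a hypothetical value for illustration.

from gigaspatial.handlers.ghsl import GHSLDataConfig

config = GHSLDataConfig(product="GHS_POP", year=2020, resolution=100)
print(config.compute_dataset_url())                  # global zip for the product
print(config.compute_dataset_url(tile_id="R7_C22"))  # per-tile zip under .../tiles/
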
get_data_unit_path(unit=None, file_ext='.zip', **kwargs)

Construct and return the path for the configured dataset or dataset tile.

Source code in gigaspatial/handlers/ghsl.py
def get_data_unit_path(self, unit: str = None, file_ext=".zip", **kwargs) -> Path:
    """Construct and return the path for the configured dataset or dataset tile."""
    info = self._get_product_info()

    tile_path = (
        self.base_path
        / info["product_folder"]
        / (
            f"{info['product_name']}_V{info['product_version']}_0"
            + (f"_{unit}" if unit else "")
            + file_ext
        )
    )

    return tile_path
get_relevant_data_units_by_geometry(geometry, **kwargs)

Return intersecting tiles for a given geometry or GeoDataFrame.

Source code in gigaspatial/handlers/ghsl.py
def get_relevant_data_units_by_geometry(
    self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
) -> List[dict]:
    """
    Return intersecting tiles for a given geometry or GeoDataFrame.
    """
    return self._get_relevant_tiles(geometry)
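
Example (illustrative): a minimal sketch that looks up the GHSL tiles intersecting a small bounding box. Network access is assumed for the tile index, and the coordinates are an arbitrary illustrative extent.

from shapely.geometry import box
from gigaspatial.handlers.ghsl import GHSLDataConfig

config = GHSLDataConfig(product="GHS_BUILT_S", year=2020, resolution=100)
aoi = box(36.6, -1.5, 37.2, -1.1)  # lon/lat bounding box
tile_ids = config.get_relevant_data_units_by_geometry(aoi)
print(tile_ids)
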
get_relevant_data_units_by_points(points, **kwargs)

Return intersecting tiles for a list of points.

Source code in gigaspatial/handlers/ghsl.py
def get_relevant_data_units_by_points(
    self, points: Iterable[Union[Point, tuple]], **kwargs
) -> List[dict]:
    """
    Return intersecting tiles for a list of points.
    """
    return self._get_relevant_tiles(points)
validate_configuration()

Validate that the configuration is valid based on dataset availability constraints.

Specific rules:

  • GHS_BUILT_V, GHS_POP and GHS_SMOD products are not available for 2018.
  • Building height (GHS_BUILT_H_*) products are only available for 2018.
  • Built-up surface (GHS_BUILT_S) for 2018 is only available at 10m resolution in the Mollweide projection.
  • GHS_SMOD is only available at 1000m resolution in the Mollweide projection.
  • Incompatible year/resolution/coordinate system combinations are either rejected or coerced to a valid configuration with a warning.

Source code in gigaspatial/handlers/ghsl.py
@model_validator(mode="after")
def validate_configuration(self):
    """
    Validate that the configuration is valid based on dataset availability constraints.

    Specific rules:
    - GHS_BUILT_V, GHS_POP and GHS_SMOD products are not available for 2018.
    - Building height (GHS_BUILT_H_*) products are only available for 2018.
    - Built-up surface (GHS_BUILT_S) for 2018 is only available at 10m resolution in the Mollweide projection.
    - GHS_SMOD is only available at 1000m resolution in the Mollweide projection.
    - Incompatible year/resolution/coordinate system combinations are either rejected or coerced to a valid configuration with a warning.
    """
    if self.year == 2018 and self.product in ["GHS_BUILT_V", "GHS_POP", "GHS_SMOD"]:
        raise ValueError(f"{self.product} product is not available for 2018")

    if self.resolution == 10 and "GHS_BUILT_H" not in self.product:
        raise ValueError(
            f"{self.product} product is not available at 10 (10m) resolution"
        )

    if "GHS_BUILT_H" in self.product:
        if self.year != 2018:
            self.logger.warning(
                "Building height product is only available for 2018, year is set as 2018"
            )
            self.year = 2018

    if self.product == "GHS_BUILT_S":
        if self.year == 2018 and self.resolution != 10:
            self.logger.warning(
                "Built-up surface product for 2018 is only available at 10m resolution, resolution is set as 10m"
            )
            self.resolution = 10

        if self.resolution == 10 and self.year != 2018:
            self.logger.warning(
                "Built-up surface product at resolution 10 is only available for 2018, year is set as 2018"
            )
            self.year = 2018

        if self.resolution == 10 and self.coord_system != CoordSystem.Mollweide:
            self.logger.warning(
                f"Built-up surface product at resolution 10 is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
            )
            self.coord_system = CoordSystem.Mollweide

    if self.product == "GHS_SMOD":
        if self.resolution != 1000:
            self.logger.warning(
                f"Settlement model (SMOD) product is only available at 1000 (1km) resolution, resolution is set as 1000"
            )
            self.resolution = 1000

        if self.coord_system != CoordSystem.Mollweide:
            self.logger.warning(
                f"Settlement model (SMOD) product is only available with Mollweide ({CoordSystem.Mollweide}) projection, coordinate system is set as Mollweide"
            )
            self.coord_system = CoordSystem.Mollweide

    self.TILES_URL = self.TILES_URL.format(self.coord_system.value)
    self._load_tiles()

    return self

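Example (illustrative): a minimal sketch of the coercion behaviour; requesting GHS_SMOD at 100m in WGS84 triggers warnings and the validator rewrites the configuration to 1000m Mollweide. Network access is assumed for loading the tile index.

from gigaspatial.handlers.ghsl import GHSLDataConfig

smod = GHSLDataConfig(product="GHS_SMOD", year=2020, resolution=100)
print(smod.resolution)         # 1000
print(smod.coord_system.name)  # "Mollweide"
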
GHSLDataDownloader

Bases: BaseHandlerDownloader

A class to handle downloads of GHSL datasets.

Source code in gigaspatial/handlers/ghsl.py
class GHSLDataDownloader(BaseHandlerDownloader):
    """A class to handle downloads of GHSL datasets."""

    def __init__(
        self,
        config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses default logger.
        """
        config = (
            config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self,
        tile_id: str,
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> Optional[Union[Path, List[Path]]]:
        """
        Downloads and optionally extracts files for a given tile.

        Args:
            tile_id: tile ID to process.
            extract: If True and the downloaded file is a zip, extract its contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional parameters passed to download methods

        Returns:
            Path to the downloaded file if extract=False,
            List of paths to the extracted files if extract=True,
            None on failure.
        """
        url = self.config.compute_dataset_url(tile_id=tile_id)
        output_path = self.config.get_data_unit_path(tile_id)

        if not extract:
            return self._download_file(url, output_path)

        extracted_files: List[Path] = []
        temp_downloaded_path: Optional[Path] = None

        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file:
                temp_downloaded_path = Path(temp_file.name)
                self.logger.debug(
                    f"Downloading {url} to temporary file: {temp_downloaded_path}"
                )

                response = requests.get(url, stream=True)
                response.raise_for_status()

                total_size = int(response.headers.get("content-length", 0))

                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {tile_id}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            temp_file.write(chunk)
                            pbar.update(len(chunk))

            self.logger.info(f"Successfully downloaded temporary file!")

            with zipfile.ZipFile(str(temp_downloaded_path), "r") as zip_ref:
                if file_pattern:
                    import re

                    pattern = re.compile(file_pattern)
                    files_to_extract = [
                        f for f in zip_ref.namelist() if pattern.match(f)
                    ]
                else:
                    files_to_extract = zip_ref.namelist()

                for file in files_to_extract:
                    extracted_path = output_path.parent / Path(file).name
                    with zip_ref.open(file) as source:
                        file_content = source.read()
                        self.data_store.write_file(str(extracted_path), file_content)
                    extracted_files.append(extracted_path)
                    self.logger.info(f"Extracted {file} to {extracted_path}")

            Path(temp_file.name).unlink()
            return extracted_files

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download {url} to temporary file: {e}")
            return None
        except zipfile.BadZipFile:
            self.logger.error(f"Downloaded file for {tile_id} is not a valid zip file.")
            return None
        except Exception as e:
            self.logger.error(f"Error downloading/extracting tile {tile_id}: {e}")
            return None
        finally:
            if temp_downloaded_path and temp_downloaded_path.exists():
                try:
                    temp_downloaded_path.unlink()
                    self.logger.debug(f"Deleted temporary file: {temp_downloaded_path}")
                except OSError as e:
                    self.logger.warning(
                        f"Could not delete temporary file {temp_downloaded_path}: {e}"
                    )

    def download_data_units(
        self,
        tile_ids: List[str],
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Downloads multiple tiles in parallel, with an option to extract them.

        Args:
            tile_ids: A list of tile IDs to download.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional parameters passed to download methods

        Returns:
            A list where each element corresponds to a tile ID and contains:
            - Path to the downloaded file if extract=False.
            - List of paths to extracted files if extract=True.
            - None if the download or extraction failed for a tile.
        """
        if not tile_ids:
            self.logger.warning("No tiles to download")
            return []

        with multiprocessing.Pool(processes=self.config.n_workers) as pool:
            download_func = functools.partial(
                self.download_data_unit, extract=extract, file_pattern=file_pattern
            )
            file_paths = list(
                tqdm(
                    pool.imap(download_func, tile_ids),
                    total=len(tile_ids),
                    desc=f"Downloading data",
                )
            )

        return file_paths

    def download(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # shapely geoms
            gpd.GeoDataFrame,
        ],
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Download GHSL data for a specified geographic region.

        The region can be defined by a country code/name, a list of points,
        a Shapely geometry, or a GeoDataFrame. This method identifies the
        relevant GHSL tiles intersecting the region and downloads the
        specified type of data (polygons or points) for those tiles in parallel.

        Args:
            source: Defines the geographic area for which to download data.
                    Can be:
                      - A string representing a country code or name.
                      - A list of (latitude, longitude) tuples or Shapely Point objects.
                      - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                      - A GeoDataFrame with geometry column in EPSG:4326.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional keyword arguments. These will be passed down to
                      `AdminBoundaries.create()` (if `source` is a country)
                      and to `self.download_data_units()`.

        Returns:
            A list of local file paths for the successfully downloaded tiles.
            Returns an empty list if no data is found for the region or if
            all downloads fail.
        """

        tiles = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(
            tiles, extract=extract, file_pattern=file_pattern, **kwargs
        )

    def download_by_country(
        self,
        country_code: str,
        data_store: Optional[DataStore] = None,
        country_geom_path: Optional[Union[str, Path]] = None,
        extract: bool = True,
        file_pattern: Optional[str] = r".*\.tif$",
        **kwargs,
    ) -> List[Optional[Union[Path, List[Path]]]]:
        """
        Download GHSL data for a specific country.

        This is a convenience method to download data for an entire country
        using its code or name.

        Args:
            country_code: The country code (e.g., 'USA', 'GBR') or name.
            data_store: Optional instance of a `DataStore` to be used by
                        `AdminBoundaries` for loading country boundaries. If None,
                        `AdminBoundaries` will use its default data loading.
            country_geom_path: Optional path to a GeoJSON file containing the
                               country boundary. If provided, this boundary is used
                               instead of the default from `AdminBoundaries`.
            extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
            file_pattern: Optional regex pattern to filter extracted files (if extract=True).
            **kwargs: Additional keyword arguments that are passed to
                      `download_data_units`. For example, `extract` to download and extract.

        Returns:
            A list of local file paths for the successfully downloaded tiles
            for the specified country.
        """
        return self.download(
            source=country_code,
            data_store=data_store,
            path=country_geom_path,
            extract=extract,
            file_pattern=file_pattern,
            **kwargs,
        )

    def _download_file(self, url: str, output_path: Path) -> Optional[Path]:
        """
        Downloads a file from a URL to a specified output path with a progress bar.

        Args:
            url: The URL to download from.
            output_path: The local path to save the downloaded file.

        Returns:
            The path to the downloaded file on success, None on failure.
        """
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            with self.data_store.open(str(output_path), "wb") as file:
                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {output_path.name}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))

            self.logger.debug(f"Successfully downloaded: {url} to {output_path}")
            return output_path

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download {url}: {str(e)}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading {url}: {str(e)}")
            return None
__init__(config, data_store=None, logger=None)

Initialize the downloader.

Parameters:

    config (Union[GHSLDataConfig, dict[str, Union[str, int]]], required)
        Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters.
    data_store (Optional[DataStore], default: None)
        Optional data storage interface. If not provided, uses LocalDataStore.
    logger (Optional[Logger], default: None)
        Optional custom logger. If not provided, uses default logger.
Source code in gigaspatial/handlers/ghsl.py
def __init__(
    self,
    config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
        data_store: Optional data storage interface. If not provided, uses LocalDataStore.
        logger: Optional custom logger. If not provided, uses default logger.
    """
    config = (
        config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
    )
    super().__init__(config=config, data_store=data_store, logger=logger)
download(source, extract=True, file_pattern='.*\\.tif$', **kwargs)

Download GHSL data for a specified geographic region.

The region can be defined by a country code/name, a list of points, a Shapely geometry, or a GeoDataFrame. This method identifies the relevant GHSL tiles intersecting the region and downloads the specified type of data (polygons or points) for those tiles in parallel.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| source | Union[str, List[Union[Tuple[float, float], Point]], BaseGeometry, GeoDataFrame] | Defines the geographic area for which to download data. Can be: a string representing a country code or name; a list of (latitude, longitude) tuples or Shapely Point objects; a Shapely BaseGeometry object (e.g., Polygon, MultiPolygon); or a GeoDataFrame with geometry column in EPSG:4326. | required |
| extract | bool | If True and the downloaded files are zips, extract their contents. Defaults to True. | True |
| file_pattern | Optional[str] | Optional regex pattern to filter extracted files (if extract=True). | '.*\\.tif$' |
| **kwargs |  | Additional keyword arguments. These will be passed down to AdminBoundaries.create() (if source is a country) and to self.download_data_units(). | {} |

Returns:

| Type | Description |
| --- | --- |
| List[Optional[Union[Path, List[Path]]]] | A list of local file paths for the successfully downloaded tiles. Returns an empty list if no data is found for the region or if all downloads fail. |

Source code in gigaspatial/handlers/ghsl.py
def download(
    self,
    source: Union[
        str,  # country
        List[Union[Tuple[float, float], Point]],  # points
        BaseGeometry,  # shapely geoms
        gpd.GeoDataFrame,
    ],
    extract: bool = True,
    file_pattern: Optional[str] = r".*\.tif$",
    **kwargs,
) -> List[Optional[Union[Path, List[Path]]]]:
    """
    Download GHSL data for a specified geographic region.

    The region can be defined by a country code/name, a list of points,
    a Shapely geometry, or a GeoDataFrame. This method identifies the
    relevant GHSL tiles intersecting the region and downloads the
    specified type of data (polygons or points) for those tiles in parallel.

    Args:
        source: Defines the geographic area for which to download data.
                Can be:
                  - A string representing a country code or name.
                  - A list of (latitude, longitude) tuples or Shapely Point objects.
                  - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                  - A GeoDataFrame with geometry column in EPSG:4326.
        extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
        file_pattern: Optional regex pattern to filter extracted files (if extract=True).
        **kwargs: Additional keyword arguments. These will be passed down to
                  `AdminBoundaries.create()` (if `source` is a country)
                  and to `self.download_data_units()`.

    Returns:
        A list of local file paths for the successfully downloaded tiles.
        Returns an empty list if no data is found for the region or if
        all downloads fail.
    """

    tiles = self.config.get_relevant_data_units(source, **kwargs)
    return self.download_data_units(
        tiles, extract=extract, file_pattern=file_pattern, **kwargs
    )
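
For orientation, a minimal usage sketch (not taken from the library docs): it builds the downloader from a plain dict using the product/year/resolution fields documented for GHSLDataConfig on this page, and uses an illustrative bounding box.

from shapely.geometry import box

from gigaspatial.handlers.ghsl import GHSLDataDownloader

# Build the downloader from a plain dict; product/year/resolution values are illustrative.
downloader = GHSLDataDownloader(
    config={"product": "GHS_POP", "year": 2020, "resolution": 100}
)

# Download every tile intersecting a country...
country_paths = downloader.download("KEN")

# ...or only the tiles intersecting an area of interest (coordinates in EPSG:4326).
aoi = box(36.6, -1.5, 37.1, -1.1)  # illustrative extent
aoi_paths = downloader.download(aoi, extract=True, file_pattern=r".*\.tif$")
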
download_by_country(country_code, data_store=None, country_geom_path=None, extract=True, file_pattern='.*\\.tif$', **kwargs)

Download GHSL data for a specific country.

This is a convenience method to download data for an entire country using its code or name.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| country_code | str | The country code (e.g., 'USA', 'GBR') or name. | required |
| data_store | Optional[DataStore] | Optional instance of a DataStore to be used by AdminBoundaries for loading country boundaries. If None, AdminBoundaries will use its default data loading. | None |
| country_geom_path | Optional[Union[str, Path]] | Optional path to a GeoJSON file containing the country boundary. If provided, this boundary is used instead of the default from AdminBoundaries. | None |
| extract | bool | If True and the downloaded files are zips, extract their contents. Defaults to True. | True |
| file_pattern | Optional[str] | Optional regex pattern to filter extracted files (if extract=True). | '.*\\.tif$' |
| **kwargs |  | Additional keyword arguments that are passed to download_data_units. | {} |

Returns:

| Type | Description |
| --- | --- |
| List[Optional[Union[Path, List[Path]]]] | A list of local file paths for the successfully downloaded tiles for the specified country. |

Source code in gigaspatial/handlers/ghsl.py
def download_by_country(
    self,
    country_code: str,
    data_store: Optional[DataStore] = None,
    country_geom_path: Optional[Union[str, Path]] = None,
    extract: bool = True,
    file_pattern: Optional[str] = r".*\.tif$",
    **kwargs,
) -> List[Optional[Union[Path, List[Path]]]]:
    """
    Download GHSL data for a specific country.

    This is a convenience method to download data for an entire country
    using its code or name.

    Args:
        country_code: The country code (e.g., 'USA', 'GBR') or name.
        data_store: Optional instance of a `DataStore` to be used by
                    `AdminBoundaries` for loading country boundaries. If None,
                    `AdminBoundaries` will use its default data loading.
        country_geom_path: Optional path to a GeoJSON file containing the
                           country boundary. If provided, this boundary is used
                           instead of the default from `AdminBoundaries`.
        extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
        file_pattern: Optional regex pattern to filter extracted files (if extract=True).
        **kwargs: Additional keyword arguments that are passed to
                  `download_data_units`. For example, `extract` to download and extract.

    Returns:
        A list of local file paths for the successfully downloaded tiles
        for the specified country.
    """
    return self.download(
        source=country_code,
        data_store=data_store,
        path=country_geom_path,
        extract=extract,
        file_pattern=file_pattern,
        **kwargs,
    )
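
A short sketch of the country-level convenience call; `downloader` is the instance configured in the earlier example, and the country code and boundary path are illustrative.

# Download all GHSL tiles covering a country.
tif_paths = downloader.download_by_country("GBR")

# Optionally point at a pre-downloaded boundary file (hypothetical path).
# tif_paths = downloader.download_by_country("GBR", country_geom_path="boundaries/gbr.geojson")
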
download_data_unit(tile_id, extract=True, file_pattern='.*\\.tif$', **kwargs)

Downloads and optionally extracts files for a given tile.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| tile_id | str | Tile ID to process. | required |
| extract | bool | If True and the downloaded file is a zip, extract its contents. Defaults to True. | True |
| file_pattern | Optional[str] | Optional regex pattern to filter extracted files (if extract=True). | '.*\\.tif$' |
| **kwargs |  | Additional parameters passed to download methods | {} |

Returns:

| Type | Description |
| --- | --- |
| Optional[Union[Path, List[Path]]] | Path to the downloaded file if extract=False, list of paths to the extracted files if extract=True, None on failure. |

Source code in gigaspatial/handlers/ghsl.py
def download_data_unit(
    self,
    tile_id: str,
    extract: bool = True,
    file_pattern: Optional[str] = r".*\.tif$",
    **kwargs,
) -> Optional[Union[Path, List[Path]]]:
    """
    Downloads and optionally extracts files for a given tile.

    Args:
        tile_id: tile ID to process.
        extract: If True and the downloaded file is a zip, extract its contents. Defaults to True.
        file_pattern: Optional regex pattern to filter extracted files (if extract=True).
        **kwargs: Additional parameters passed to download methods

    Returns:
        Path to the downloaded file if extract=False,
        List of paths to the extracted files if extract=True,
        None on failure.
    """
    url = self.config.compute_dataset_url(tile_id=tile_id)
    output_path = self.config.get_data_unit_path(tile_id)

    if not extract:
        return self._download_file(url, output_path)

    extracted_files: List[Path] = []
    temp_downloaded_path: Optional[Path] = None

    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_file:
            temp_downloaded_path = Path(temp_file.name)
            self.logger.debug(
                f"Downloading {url} to temporary file: {temp_downloaded_path}"
            )

            response = requests.get(url, stream=True)
            response.raise_for_status()

            total_size = int(response.headers.get("content-length", 0))

            with tqdm(
                total=total_size,
                unit="B",
                unit_scale=True,
                desc=f"Downloading {tile_id}",
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        temp_file.write(chunk)
                        pbar.update(len(chunk))

        self.logger.info(f"Successfully downloaded temporary file!")

        with zipfile.ZipFile(str(temp_downloaded_path), "r") as zip_ref:
            if file_pattern:
                import re

                pattern = re.compile(file_pattern)
                files_to_extract = [
                    f for f in zip_ref.namelist() if pattern.match(f)
                ]
            else:
                files_to_extract = zip_ref.namelist()

            for file in files_to_extract:
                extracted_path = output_path.parent / Path(file).name
                with zip_ref.open(file) as source:
                    file_content = source.read()
                    self.data_store.write_file(str(extracted_path), file_content)
                extracted_files.append(extracted_path)
                self.logger.info(f"Extracted {file} to {extracted_path}")

        Path(temp_file.name).unlink()
        return extracted_files

    except requests.exceptions.RequestException as e:
        self.logger.error(f"Failed to download {url} to temporary file: {e}")
        return None
    except zipfile.BadZipFile:
        self.logger.error(f"Downloaded file for {tile_id} is not a valid zip file.")
        return None
    except Exception as e:
        self.logger.error(f"Error downloading/extracting tile {tile_id}: {e}")
        return None
    finally:
        if temp_downloaded_path and temp_downloaded_path.exists():
            try:
                temp_downloaded_path.unlink()
                self.logger.debug(f"Deleted temporary file: {temp_downloaded_path}")
            except OSError as e:
                self.logger.warning(
                    f"Could not delete temporary file {temp_downloaded_path}: {e}"
                )
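
A sketch of single-tile usage, reusing the `downloader` from the earlier example; the tile ID shown is purely illustrative and must match the tiling scheme expected by the configured product.

# Keep the raw zip archive instead of extracting it.
zip_path = downloader.download_data_unit("R7_C22", extract=False)

# Default behaviour: extract and return only the .tif members.
tif_paths = downloader.download_data_unit("R7_C22")
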
download_data_units(tile_ids, extract=True, file_pattern='.*\\.tif$', **kwargs)

Downloads multiple tiles in parallel, with an option to extract them.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| tile_ids | List[str] | A list of tile IDs to download. | required |
| extract | bool | If True and the downloaded files are zips, extract their contents. Defaults to True. | True |
| file_pattern | Optional[str] | Optional regex pattern to filter extracted files (if extract=True). | '.*\\.tif$' |
| **kwargs |  | Additional parameters passed to download methods | {} |

Returns:

| Type | Description |
| --- | --- |
| List[Optional[Union[Path, List[Path]]]] | A list where each element corresponds to a tile ID and contains: the path to the downloaded file if extract=False, a list of paths to extracted files if extract=True, or None if the download or extraction failed for that tile. |

Source code in gigaspatial/handlers/ghsl.py
def download_data_units(
    self,
    tile_ids: List[str],
    extract: bool = True,
    file_pattern: Optional[str] = r".*\.tif$",
    **kwargs,
) -> List[Optional[Union[Path, List[Path]]]]:
    """
    Downloads multiple tiles in parallel, with an option to extract them.

    Args:
        tile_ids: A list of tile IDs to download.
        extract: If True and the downloaded files are zips, extract their contents. Defaults to True.
        file_pattern: Optional regex pattern to filter extracted files (if extract=True).
        **kwargs: Additional parameters passed to download methods

    Returns:
        A list where each element corresponds to a tile ID and contains:
        - Path to the downloaded file if extract=False.
        - List of paths to extracted files if extract=True.
        - None if the download or extraction failed for a tile.
    """
    if not tile_ids:
        self.logger.warning("No tiles to download")
        return []

    with multiprocessing.Pool(processes=self.config.n_workers) as pool:
        download_func = functools.partial(
            self.download_data_unit, extract=extract, file_pattern=file_pattern
        )
        file_paths = list(
            tqdm(
                pool.imap(download_func, tile_ids),
                total=len(tile_ids),
                desc=f"Downloading data",
            )
        )

    return file_paths
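
A sketch of batch downloading with a simple failure check; the tile IDs are illustrative, and the degree of parallelism comes from the configured n_workers.

tile_ids = ["R7_C22", "R8_C22"]  # illustrative tile IDs
results = downloader.download_data_units(tile_ids, extract=True)

# Each result lines up with its tile ID; None marks a failed download or extraction.
failed = [tid for tid, result in zip(tile_ids, results) if result is None]
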

GHSLDataHandler

Bases: BaseHandler

Handler for GHSL (Global Human Settlement Layer) dataset.

This class provides a unified interface for downloading and loading GHSL data. It manages the lifecycle of configuration, downloading, and reading components.

Source code in gigaspatial/handlers/ghsl.py
class GHSLDataHandler(BaseHandler):
    """
    Handler for GHSL (Global Human Settlement Layer) dataset.

    This class provides a unified interface for downloading and loading GHSL data.
    It manages the lifecycle of configuration, downloading, and reading components.
    """

    def __init__(
        self,
        product: Literal[
            "GHS_BUILT_S",
            "GHS_BUILT_H_AGBH",
            "GHS_BUILT_H_ANBH",
            "GHS_BUILT_V",
            "GHS_POP",
            "GHS_SMOD",
        ],
        year: int = 2020,
        resolution: int = 100,
        config: Optional[GHSLDataConfig] = None,
        downloader: Optional[GHSLDataDownloader] = None,
        reader: Optional[GHSLDataReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        """
        Initialize the GHSLDataHandler.

        Args:
            product: The GHSL product to use. Must be one of:
                    - GHS_BUILT_S: Built-up surface
                    - GHS_BUILT_H_AGBH: Average building height
                    - GHS_BUILT_H_ANBH: Average number of building heights
                    - GHS_BUILT_V: Building volume
                    - GHS_POP: Population
                    - GHS_SMOD: Settlement model
            year: The year of the data (default: 2020)
            resolution: The resolution in meters (default: 100)
            config: Optional configuration object
            downloader: Optional downloader instance
            reader: Optional reader instance
            data_store: Optional data store instance
            logger: Optional logger instance
            **kwargs: Additional configuration parameters
        """
        self._product = product
        self._year = year
        self._resolution = resolution
        super().__init__(
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> GHSLDataConfig:
        """
        Create and return a GHSLDataConfig instance.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured GHSLDataConfig instance
        """
        return GHSLDataConfig(
            product=self._product,
            year=self._year,
            resolution=self._resolution,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_downloader(
        self,
        config: GHSLDataConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GHSLDataDownloader:
        """
        Create and return a GHSLDataDownloader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured GHSLDataDownloader instance
        """
        return GHSLDataDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self,
        config: GHSLDataConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GHSLDataReader:
        """
        Create and return a GHSLDataReader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured GHSLDataReader instance
        """
        return GHSLDataReader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def load_data(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ):
        return super().load_data(
            source=source,
            ensure_available=ensure_available,
            file_ext=".tif",
            extract=True,
            file_pattern=r".*\.tif$",
            **kwargs,
        )

    def load_into_dataframe(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Load GHSL data into a pandas DataFrame.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            DataFrame containing the GHSL data
        """
        tif_processors = self.load_data(
            source=source, ensure_available=ensure_available, **kwargs
        )
        return pd.concat(
            [tp.to_dataframe() for tp in tif_processors], ignore_index=True
        )

    def load_into_geodataframe(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Load GHSL data into a geopandas GeoDataFrame.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            GeoDataFrame containing the GHSL data
        """
        tif_processors = self.load_data(
            source=source, ensure_available=ensure_available, **kwargs
        )
        return pd.concat(
            [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
        )
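
A minimal end-to-end sketch of the handler interface, assuming the default LocalDataStore and illustrative product/year/resolution and country values.

from gigaspatial.handlers.ghsl import GHSLDataHandler

handler = GHSLDataHandler(product="GHS_POP", year=2020, resolution=100)

# Download (if needed) and load the rasters covering a country.
processors = handler.load_data("RWA")    # list of TifProcessor objects
df = handler.load_into_dataframe("RWA")  # flattened tabular view of the same rasters
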
__init__(product, year=2020, resolution=100, config=None, downloader=None, reader=None, data_store=None, logger=None, **kwargs)

Initialize the GHSLDataHandler.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| product | Literal['GHS_BUILT_S', 'GHS_BUILT_H_AGBH', 'GHS_BUILT_H_ANBH', 'GHS_BUILT_V', 'GHS_POP', 'GHS_SMOD'] | The GHSL product to use. Must be one of: GHS_BUILT_S (built-up surface), GHS_BUILT_H_AGBH (average building height), GHS_BUILT_H_ANBH (average number of building heights), GHS_BUILT_V (building volume), GHS_POP (population), GHS_SMOD (settlement model). | required |
| year | int | The year of the data (default: 2020) | 2020 |
| resolution | int | The resolution in meters (default: 100) | 100 |
| config | Optional[GHSLDataConfig] | Optional configuration object | None |
| downloader | Optional[GHSLDataDownloader] | Optional downloader instance | None |
| reader | Optional[GHSLDataReader] | Optional reader instance | None |
| data_store | Optional[DataStore] | Optional data store instance | None |
| logger | Optional[Logger] | Optional logger instance | None |
| **kwargs |  | Additional configuration parameters | {} |

Source code in gigaspatial/handlers/ghsl.py
def __init__(
    self,
    product: Literal[
        "GHS_BUILT_S",
        "GHS_BUILT_H_AGBH",
        "GHS_BUILT_H_ANBH",
        "GHS_BUILT_V",
        "GHS_POP",
        "GHS_SMOD",
    ],
    year: int = 2020,
    resolution: int = 100,
    config: Optional[GHSLDataConfig] = None,
    downloader: Optional[GHSLDataDownloader] = None,
    reader: Optional[GHSLDataReader] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
    **kwargs,
):
    """
    Initialize the GHSLDataHandler.

    Args:
        product: The GHSL product to use. Must be one of:
                - GHS_BUILT_S: Built-up surface
                - GHS_BUILT_H_AGBH: Average building height
                - GHS_BUILT_H_ANBH: Average number of building heights
                - GHS_BUILT_V: Building volume
                - GHS_POP: Population
                - GHS_SMOD: Settlement model
        year: The year of the data (default: 2020)
        resolution: The resolution in meters (default: 100)
        config: Optional configuration object
        downloader: Optional downloader instance
        reader: Optional reader instance
        data_store: Optional data store instance
        logger: Optional logger instance
        **kwargs: Additional configuration parameters
    """
    self._product = product
    self._year = year
    self._resolution = resolution
    super().__init__(
        config=config,
        downloader=downloader,
        reader=reader,
        data_store=data_store,
        logger=logger,
    )
create_config(data_store, logger, **kwargs)

Create and return a GHSLDataConfig instance.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data_store | DataStore | The data store instance to use | required |
| logger | Logger | The logger instance to use | required |
| **kwargs |  | Additional configuration parameters | {} |

Returns:

| Type | Description |
| --- | --- |
| GHSLDataConfig | Configured GHSLDataConfig instance |

Source code in gigaspatial/handlers/ghsl.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> GHSLDataConfig:
    """
    Create and return a GHSLDataConfig instance.

    Args:
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional configuration parameters

    Returns:
        Configured GHSLDataConfig instance
    """
    return GHSLDataConfig(
        product=self._product,
        year=self._year,
        resolution=self._resolution,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
create_downloader(config, data_store, logger, **kwargs)

Create and return a GHSLDataDownloader instance.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| config | GHSLDataConfig | The configuration object | required |
| data_store | DataStore | The data store instance to use | required |
| logger | Logger | The logger instance to use | required |
| **kwargs |  | Additional downloader parameters | {} |

Returns:

| Type | Description |
| --- | --- |
| GHSLDataDownloader | Configured GHSLDataDownloader instance |

Source code in gigaspatial/handlers/ghsl.py
def create_downloader(
    self,
    config: GHSLDataConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> GHSLDataDownloader:
    """
    Create and return a GHSLDataDownloader instance.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional downloader parameters

    Returns:
        Configured GHSLDataDownloader instance
    """
    return GHSLDataDownloader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a GHSLDataReader instance.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| config | GHSLDataConfig | The configuration object | required |
| data_store | DataStore | The data store instance to use | required |
| logger | Logger | The logger instance to use | required |
| **kwargs |  | Additional reader parameters | {} |

Returns:

| Type | Description |
| --- | --- |
| GHSLDataReader | Configured GHSLDataReader instance |

Source code in gigaspatial/handlers/ghsl.py
def create_reader(
    self,
    config: GHSLDataConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> GHSLDataReader:
    """
    Create and return a GHSLDataReader instance.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional reader parameters

    Returns:
        Configured GHSLDataReader instance
    """
    return GHSLDataReader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
load_into_dataframe(source, ensure_available=True, **kwargs)

Load GHSL data into a pandas DataFrame.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| source | Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]] | The data source specification | required |
| ensure_available | bool | If True, ensure data is downloaded before loading | True |
| **kwargs |  | Additional parameters passed to load methods | {} |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | DataFrame containing the GHSL data |

Source code in gigaspatial/handlers/ghsl.py
def load_into_dataframe(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    ensure_available: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Load GHSL data into a pandas DataFrame.

    Args:
        source: The data source specification
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        DataFrame containing the GHSL data
    """
    tif_processors = self.load_data(
        source=source, ensure_available=ensure_available, **kwargs
    )
    return pd.concat(
        [tp.to_dataframe() for tp in tif_processors], ignore_index=True
    )
load_into_geodataframe(source, ensure_available=True, **kwargs)

Load GHSL data into a geopandas GeoDataFrame.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| source | Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]] | The data source specification | required |
| ensure_available | bool | If True, ensure data is downloaded before loading | True |
| **kwargs |  | Additional parameters passed to load methods | {} |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | GeoDataFrame containing the GHSL data |

Source code in gigaspatial/handlers/ghsl.py
def load_into_geodataframe(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    ensure_available: bool = True,
    **kwargs,
) -> pd.DataFrame:
    """
    Load GHSL data into a geopandas GeoDataFrame.

    Args:
        source: The data source specification
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        GeoDataFrame containing the GHSL data
    """
    tif_processors = self.load_data(
        source=source, ensure_available=ensure_available, **kwargs
    )
    return pd.concat(
        [tp.to_geodataframe() for tp in tif_processors], ignore_index=True
    )
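
A sketch of loading only the tiles intersecting an area of interest as a GeoDataFrame; `handler` is the instance from the earlier sketch and the extent is illustrative.

from shapely.geometry import box

aoi = box(29.0, -2.9, 30.9, -1.1)  # illustrative extent in EPSG:4326
gdf = handler.load_into_geodataframe(aoi)
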

GHSLDataReader

Bases: BaseHandlerReader

Source code in gigaspatial/handlers/ghsl.py
class GHSLDataReader(BaseHandlerReader):

    def __init__(
        self,
        config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses default logger.
        """
        config = (
            config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
        )
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> List[TifProcessor]:
        """
        Load TifProcessors from GHSL dataset.
        Args:
            source_data_path: List of file paths to load
        Returns:
            List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
        """
        return self._load_raster_data(raster_paths=source_data_path)

    def load(self, source, **kwargs):
        return super().load(source=source, file_ext=".tif")
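
A sketch of using the reader directly on already-downloaded rasters; the dict config values and the local file path are illustrative.

from gigaspatial.handlers.ghsl import GHSLDataReader

reader = GHSLDataReader(
    config={"product": "GHS_SMOD", "year": 2020, "resolution": 1000}
)
processors = reader.load(["data/ghsl/GHS_SMOD_tile.tif"])  # hypothetical local path
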
__init__(config, data_store=None, logger=None)

Initialize the downloader.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| config | Union[GHSLDataConfig, dict[str, Union[str, int]]] | Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters | required |
| data_store | Optional[DataStore] | Optional data storage interface. If not provided, uses LocalDataStore. | None |
| logger | Optional[Logger] | Optional custom logger. If not provided, uses default logger. | None |

Source code in gigaspatial/handlers/ghsl.py
def __init__(
    self,
    config: Union[GHSLDataConfig, dict[str, Union[str, int]]],
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: Configuration for the GHSL dataset, either as a GHSLDataConfig object or a dictionary of parameters
        data_store: Optional data storage interface. If not provided, uses LocalDataStore.
        logger: Optional custom logger. If not provided, uses default logger.
    """
    config = (
        config if isinstance(config, GHSLDataConfig) else GHSLDataConfig(**config)
    )
    super().__init__(config=config, data_store=data_store, logger=logger)
load_from_paths(source_data_path, **kwargs)

Load TifProcessors from GHSL dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| source_data_path | List[Union[str, Path]] | List of file paths to load | required |

Returns:

| Type | Description |
| --- | --- |
| List[TifProcessor] | List of TifProcessor objects for accessing the raster data. |

Source code in gigaspatial/handlers/ghsl.py
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> List[TifProcessor]:
    """
    Load TifProcessors from GHSL dataset.
    Args:
        source_data_path: List of file paths to load
    Returns:
        List[TifProcessor]: List of TifProcessor objects for accessing the raster data.
    """
    return self._load_raster_data(raster_paths=source_data_path)

giga

GigaSchoolLocationFetcher

Fetch and process school location data from the Giga School Geolocation Data API.

Source code in gigaspatial/handlers/giga.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GigaSchoolLocationFetcher:
    """
    Fetch and process school location data from the Giga School Geolocation Data API.
    """

    country: str = Field(...)
    api_url: str = Field(
        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/schools_location/country/{isocode3}",
        description="Base URL for the Giga School API",
    )
    api_key: str = global_config.GIGA_SCHOOL_LOCATION_API_KEY
    page_size: int = Field(default=1000, description="Number of records per API page")
    sleep_time: float = Field(
        default=0.2, description="Sleep time between API requests"
    )

    logger: logging.Logger = Field(default=None, repr=False)

    def __post_init__(self):
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")
        self.api_url = self.api_url.format(isocode3=self.country)
        if self.logger is None:
            self.logger = global_config.get_logger(self.__class__.__name__)

    def fetch_locations(self, **kwargs) -> pd.DataFrame:
        """
        Fetch and process school locations.

        Args:
            **kwargs: Additional parameters for customization
                - page_size: Override default page size
                - sleep_time: Override default sleep time between requests
                - max_pages: Limit the number of pages to fetch

        Returns:
            pd.DataFrame: School locations with geospatial info.
        """
        # Override defaults with kwargs if provided
        page_size = kwargs.get("page_size", self.page_size)
        sleep_time = kwargs.get("sleep_time", self.sleep_time)
        max_pages = kwargs.get("max_pages", None)

        # Prepare headers
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        }

        all_data = []
        page = 1

        self.logger.info(
            f"Starting to fetch school locations for country: {self.country}"
        )

        while True:
            # Check if we've reached max_pages limit
            if max_pages and page > max_pages:
                self.logger.info(f"Reached maximum pages limit: {max_pages}")
                break

            params = {"page": page, "size": page_size}

            try:
                self.logger.debug(f"Fetching page {page} with params: {params}")
                response = requests.get(self.api_url, headers=headers, params=params)
                response.raise_for_status()

                parsed = response.json()
                data = parsed.get("data", [])

            except requests.exceptions.RequestException as e:
                self.logger.error(f"Request failed on page {page}: {e}")
                break
            except ValueError as e:
                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
                break

            # Check if we got any data
            if not data:
                self.logger.info(f"No data on page {page}. Stopping.")
                break

            all_data.extend(data)
            self.logger.info(f"Fetched page {page} with {len(data)} records")

            # If we got fewer records than page_size, we've reached the end
            if len(data) < page_size:
                self.logger.info("Reached end of data (partial page received)")
                break

            page += 1

            # Sleep to be respectful to the API
            if sleep_time > 0:
                time.sleep(sleep_time)

        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

        # Convert to DataFrame and process
        if not all_data:
            self.logger.warning("No data fetched, returning empty DataFrame")
            return pd.DataFrame()

        df = pd.DataFrame(all_data)

        df = self._process_geospatial_data(df)

        return df

    def _process_geospatial_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Process and enhance the DataFrame with geospatial information.

        Args:
            df: Raw DataFrame from API

        Returns:
            pd.DataFrame: Enhanced DataFrame with geospatial data
        """
        if df.empty:
            return df

        df["geometry"] = df.apply(
            lambda row: Point(row["longitude"], row["latitude"]), axis=1
        )
        self.logger.info(f"Created geometry for all {len(df)} records")

        return df
fetch_locations(**kwargs)

Fetch and process school locations.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| **kwargs |  | Additional parameters for customization: page_size (override default page size), sleep_time (override default sleep time between requests), max_pages (limit the number of pages to fetch). | {} |

Returns:

| Type | Description |
| --- | --- |
| DataFrame | pd.DataFrame: School locations with geospatial info. |

Source code in gigaspatial/handlers/giga.py
def fetch_locations(self, **kwargs) -> pd.DataFrame:
    """
    Fetch and process school locations.

    Args:
        **kwargs: Additional parameters for customization
            - page_size: Override default page size
            - sleep_time: Override default sleep time between requests
            - max_pages: Limit the number of pages to fetch

    Returns:
        pd.DataFrame: School locations with geospatial info.
    """
    # Override defaults with kwargs if provided
    page_size = kwargs.get("page_size", self.page_size)
    sleep_time = kwargs.get("sleep_time", self.sleep_time)
    max_pages = kwargs.get("max_pages", None)

    # Prepare headers
    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Accept": "application/json",
    }

    all_data = []
    page = 1

    self.logger.info(
        f"Starting to fetch school locations for country: {self.country}"
    )

    while True:
        # Check if we've reached max_pages limit
        if max_pages and page > max_pages:
            self.logger.info(f"Reached maximum pages limit: {max_pages}")
            break

        params = {"page": page, "size": page_size}

        try:
            self.logger.debug(f"Fetching page {page} with params: {params}")
            response = requests.get(self.api_url, headers=headers, params=params)
            response.raise_for_status()

            parsed = response.json()
            data = parsed.get("data", [])

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed on page {page}: {e}")
            break
        except ValueError as e:
            self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
            break

        # Check if we got any data
        if not data:
            self.logger.info(f"No data on page {page}. Stopping.")
            break

        all_data.extend(data)
        self.logger.info(f"Fetched page {page} with {len(data)} records")

        # If we got fewer records than page_size, we've reached the end
        if len(data) < page_size:
            self.logger.info("Reached end of data (partial page received)")
            break

        page += 1

        # Sleep to be respectful to the API
        if sleep_time > 0:
            time.sleep(sleep_time)

    self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

    # Convert to DataFrame and process
    if not all_data:
        self.logger.warning("No data fetched, returning empty DataFrame")
        return pd.DataFrame()

    df = pd.DataFrame(all_data)

    df = self._process_geospatial_data(df)

    return df
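
A short usage sketch; it assumes GIGA_SCHOOL_LOCATION_API_KEY is set in the gigaspatial global config, and the country and paging values are illustrative.

from gigaspatial.handlers.giga import GigaSchoolLocationFetcher

fetcher = GigaSchoolLocationFetcher(country="Kenya")  # resolved to ISO3 via pycountry
schools = fetcher.fetch_locations(page_size=500, max_pages=2)
print(len(schools), "school records with point geometries")
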

GigaSchoolMeasurementsFetcher

Fetch and process school daily realtime connectivity measurements from the Giga API. This includes download/upload speeds, latency, and connectivity performance data.

Source code in gigaspatial/handlers/giga.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GigaSchoolMeasurementsFetcher:
    """
    Fetch and process school daily realtime connectivity measurements from the Giga API.
    This includes download/upload speeds, latency, and connectivity performance data.
    """

    country: str = Field(...)
    start_date: Union[str, date, datetime] = Field(...)
    end_date: Union[str, date, datetime] = Field(...)
    api_url: str = Field(
        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/all_measurements",
        description="Base URL for the Giga School Measurements API",
    )
    api_key: str = global_config.GIGA_SCHOOL_MEASUREMENTS_API_KEY
    page_size: int = Field(default=1000, description="Number of records per API page")
    sleep_time: float = Field(
        default=0.2, description="Sleep time between API requests"
    )
    giga_id_school: Optional[str] = Field(
        default=None, description="Optional specific giga school ID to fetch"
    )

    logger: logging.Logger = Field(default=None, repr=False)

    def __post_init__(self):
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")

        # Convert dates to string format if needed
        self.start_date = self._format_date(self.start_date)
        self.end_date = self._format_date(self.end_date)

        # Validate date range
        if self.start_date > self.end_date:
            raise ValueError("start_date must be before or equal to end_date")

        if self.logger is None:
            self.logger = global_config.get_logger(self.__class__.__name__)

    def _format_date(self, date_input: Union[str, date, datetime]) -> str:
        """
        Convert date input to string format expected by API (YYYY-MM-DD).

        Args:
            date_input: Date in various formats

        Returns:
            str: Date in YYYY-MM-DD format
        """
        if isinstance(date_input, str):
            # Assume it's already in correct format or parse it
            try:
                parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
                return date_input
            except ValueError:
                try:
                    parsed_date = pd.to_datetime(date_input)
                    return parsed_date.strftime("%Y-%m-%d")
                except:
                    raise ValueError(
                        f"Invalid date format: {date_input}. Expected YYYY-MM-DD"
                    )
        elif isinstance(date_input, (date, datetime)):
            return date_input.strftime("%Y-%m-%d")
        else:
            raise ValueError(f"Invalid date type: {type(date_input)}")

    def fetch_measurements(self, **kwargs) -> pd.DataFrame:
        """
        Fetch and process school connectivity measurements.

        Args:
            **kwargs: Additional parameters for customization
                - page_size: Override default page size
                - sleep_time: Override default sleep time between requests
                - max_pages: Limit the number of pages to fetch
                - giga_id_school: Override default giga_id_school filter
                - start_date: Override default start_date
                - end_date: Override default end_date

        Returns:
            pd.DataFrame: School measurements with connectivity performance data.
        """
        # Override defaults with kwargs if provided
        page_size = kwargs.get("page_size", self.page_size)
        sleep_time = kwargs.get("sleep_time", self.sleep_time)
        max_pages = kwargs.get("max_pages", None)
        giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
        start_date = kwargs.get("start_date", self.start_date)
        end_date = kwargs.get("end_date", self.end_date)

        # Format dates if overridden
        if start_date != self.start_date:
            start_date = self._format_date(start_date)
        if end_date != self.end_date:
            end_date = self._format_date(end_date)

        # Prepare headers
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        }

        all_data = []
        page = 1

        self.logger.info(
            f"Starting to fetch measurements for country: {self.country} "
            f"from {start_date} to {end_date}"
        )

        if giga_id_school:
            self.logger.info(f"Filtering for specific school ID: {giga_id_school}")

        while True:
            # Check if we've reached max_pages limit
            if max_pages and page > max_pages:
                self.logger.info(f"Reached maximum pages limit: {max_pages}")
                break

            # Build parameters
            params = {
                "country_iso3_code": self.country,
                "start_date": start_date,
                "end_date": end_date,
                "page": page,
                "size": page_size,
            }

            # Add giga_id_school filter if specified
            if giga_id_school:
                params["giga_id_school"] = giga_id_school

            try:
                self.logger.debug(f"Fetching page {page} with params: {params}")
                response = requests.get(self.api_url, headers=headers, params=params)
                response.raise_for_status()

                parsed = response.json()
                data = parsed.get("data", [])

            except requests.exceptions.RequestException as e:
                self.logger.error(f"Request failed on page {page}: {e}")
                break
            except ValueError as e:
                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
                break

            # Check if we got any data
            if not data:
                self.logger.info(f"No data on page {page}. Stopping.")
                break

            all_data.extend(data)
            self.logger.info(f"Fetched page {page} with {len(data)} records")

            # If we got fewer records than page_size, we've reached the end
            if len(data) < page_size:
                self.logger.info("Reached end of data (partial page received)")
                break

            # If filtering by specific school ID, we might only need one page
            if giga_id_school and len(all_data) > 0:
                self.logger.info(
                    "Specific school ID requested, checking if more data needed"
                )

            page += 1

            # Sleep to be respectful to the API
            if sleep_time > 0:
                time.sleep(sleep_time)

        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

        # Convert to DataFrame and process
        if not all_data:
            self.logger.warning("No data fetched, returning empty DataFrame")
            return pd.DataFrame()

        df = pd.DataFrame(all_data)
        df = self._process_measurements_data(df)

        return df

    def _process_measurements_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Process and enhance the DataFrame with measurement performance metrics.

        Args:
            df: Raw DataFrame from API

        Returns:
            pd.DataFrame: Enhanced DataFrame with processed measurement data
        """
        if df.empty:
            return df

        # Convert date column to datetime
        if "date" in df.columns:
            df["date"] = pd.to_datetime(df["date"], errors="coerce")
            df["date_only"] = df["date"].dt.date
            df["year"] = df["date"].dt.year
            df["month"] = df["date"].dt.month
            df["day_of_week"] = df["date"].dt.day_name()
            self.logger.info("Processed date fields")

        # Process speed measurements
        numeric_columns = ["download_speed", "upload_speed", "latency"]
        for col in numeric_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Create performance categories
        if "download_speed" in df.columns:
            df["download_speed_category"] = pd.cut(
                df["download_speed"],
                bins=[0, 5, 25, 100, float("inf")],
                labels=[
                    "Very Slow (<5 Mbps)",
                    "Slow (5-25 Mbps)",
                    "Moderate (25-100 Mbps)",
                    "Fast (>100 Mbps)",
                ],
                include_lowest=True,
            )

        if "upload_speed" in df.columns:
            df["upload_speed_category"] = pd.cut(
                df["upload_speed"],
                bins=[0, 1, 10, 50, float("inf")],
                labels=[
                    "Very Slow (<1 Mbps)",
                    "Slow (1-10 Mbps)",
                    "Moderate (10-50 Mbps)",
                    "Fast (>50 Mbps)",
                ],
                include_lowest=True,
            )

        if "latency" in df.columns:
            df["latency_category"] = pd.cut(
                df["latency"],
                bins=[0, 50, 150, 300, float("inf")],
                labels=[
                    "Excellent (<50ms)",
                    "Good (50-150ms)",
                    "Fair (150-300ms)",
                    "Poor (>300ms)",
                ],
                include_lowest=True,
            )

        # Create quality flags
        if "download_speed" in df.columns and "upload_speed" in df.columns:
            df["has_broadband"] = (df["download_speed"] >= 25) & (
                df["upload_speed"] >= 3
            )
            df["has_basic_connectivity"] = (df["download_speed"] >= 1) & (
                df["upload_speed"] >= 0.5
            )

        # Flag measurements with missing data
        df["has_complete_measurement"] = (
            df["download_speed"].notna()
            & df["upload_speed"].notna()
            & df["latency"].notna()
        )

        self.logger.info(f"Processed measurement data for {len(df)} records")

        return df

    def get_performance_summary(self, df: pd.DataFrame) -> dict:
        """
        Generate a comprehensive summary of connectivity performance metrics.

        Args:
            df: DataFrame with measurement data

        Returns:
            dict: Summary statistics about connectivity performance
        """
        if df.empty:
            return {"error": "No data available"}

        summary = {
            "total_measurements": len(df),
            "country": (
                df["country_iso3_code"].iloc[0]
                if "country_iso3_code" in df.columns
                else "Unknown"
            ),
            "date_range": {
                "start": (
                    df["date"].min().strftime("%Y-%m-%d")
                    if "date" in df.columns
                    else None
                ),
                "end": (
                    df["date"].max().strftime("%Y-%m-%d")
                    if "date" in df.columns
                    else None
                ),
            },
        }

        # School coverage
        if "giga_id_school" in df.columns:
            unique_schools = df["giga_id_school"].nunique()
            summary["unique_schools_measured"] = unique_schools
            summary["avg_measurements_per_school"] = (
                len(df) / unique_schools if unique_schools > 0 else 0
            )

        # Speed statistics
        for speed_col in ["download_speed", "upload_speed"]:
            if speed_col in df.columns:
                speed_data = df[speed_col].dropna()
                if len(speed_data) > 0:
                    summary[f"{speed_col}_stats"] = {
                        "mean": float(speed_data.mean()),
                        "median": float(speed_data.median()),
                        "min": float(speed_data.min()),
                        "max": float(speed_data.max()),
                        "std": float(speed_data.std()),
                    }

        # Latency statistics
        if "latency" in df.columns:
            latency_data = df["latency"].dropna()
            if len(latency_data) > 0:
                summary["latency_stats"] = {
                    "mean": float(latency_data.mean()),
                    "median": float(latency_data.median()),
                    "min": float(latency_data.min()),
                    "max": float(latency_data.max()),
                    "std": float(latency_data.std()),
                }

        # Performance categories
        for cat_col in [
            "download_speed_category",
            "upload_speed_category",
            "latency_category",
        ]:
            if cat_col in df.columns:
                cat_counts = df[cat_col].value_counts().to_dict()
                summary[cat_col.replace("_category", "_breakdown")] = cat_counts

        # Quality metrics
        if "has_broadband" in df.columns:
            summary["broadband_capable_measurements"] = int(df["has_broadband"].sum())
            summary["broadband_percentage"] = float(df["has_broadband"].mean() * 100)

        if "has_basic_connectivity" in df.columns:
            summary["basic_connectivity_measurements"] = int(
                df["has_basic_connectivity"].sum()
            )
            summary["basic_connectivity_percentage"] = float(
                df["has_basic_connectivity"].mean() * 100
            )

        # Data completeness
        if "has_complete_measurement" in df.columns:
            summary["complete_measurements"] = int(df["has_complete_measurement"].sum())
            summary["data_completeness_percentage"] = float(
                df["has_complete_measurement"].mean() * 100
            )

        # Data sources
        if "data_source" in df.columns:
            source_counts = df["data_source"].value_counts().to_dict()
            summary["data_sources"] = source_counts

        # Temporal patterns
        if "day_of_week" in df.columns:
            day_counts = df["day_of_week"].value_counts().to_dict()
            summary["measurements_by_day_of_week"] = day_counts

        self.logger.info("Generated performance summary")
        return summary

    def get_school_performance_comparison(
        self, df: pd.DataFrame, top_n: int = 10
    ) -> dict:
        """
        Compare performance across schools.

        Args:
            df: DataFrame with measurement data
            top_n: Number of top/bottom schools to include

        Returns:
            dict: School performance comparison
        """
        if df.empty or "giga_id_school" not in df.columns:
            return {"error": "No school data available"}

        # Build the aggregation spec from the columns that are actually present,
        # so a missing metric column does not raise a KeyError during .agg().
        metric_aggs = {
            "download_speed": ["mean", "median", "count"],
            "upload_speed": ["mean", "median"],
            "latency": ["mean", "median"],
            "has_broadband": ["mean"],
        }
        agg_spec = {
            col: funcs for col, funcs in metric_aggs.items() if col in df.columns
        }

        if not agg_spec:
            return {"error": "Insufficient data for school comparison"}

        school_stats = df.groupby("giga_id_school").agg(agg_spec).round(2)

        # Flatten column names
        school_stats.columns = ["_".join(col).strip() for col in school_stats.columns]

        # Sort by download speed
        if "download_speed_mean" in school_stats.columns:
            top_schools = school_stats.nlargest(top_n, "download_speed_mean")
            bottom_schools = school_stats.nsmallest(top_n, "download_speed_mean")

            return {
                "top_performing_schools": top_schools.to_dict("index"),
                "bottom_performing_schools": bottom_schools.to_dict("index"),
                "total_schools_analyzed": len(school_stats),
            }

        return {"error": "Insufficient data for school comparison"}
fetch_measurements(**kwargs)

Fetch and process school connectivity measurements.

Parameters:

Name Type Description Default
**kwargs

Additional parameters for customization:
- page_size: Override default page size
- sleep_time: Override default sleep time between requests
- max_pages: Limit the number of pages to fetch
- giga_id_school: Override default giga_id_school filter
- start_date: Override default start_date
- end_date: Override default end_date

{}

Returns:

Type Description
DataFrame

pd.DataFrame: School measurements with connectivity performance data.

Source code in gigaspatial/handlers/giga.py
def fetch_measurements(self, **kwargs) -> pd.DataFrame:
    """
    Fetch and process school connectivity measurements.

    Args:
        **kwargs: Additional parameters for customization
            - page_size: Override default page size
            - sleep_time: Override default sleep time between requests
            - max_pages: Limit the number of pages to fetch
            - giga_id_school: Override default giga_id_school filter
            - start_date: Override default start_date
            - end_date: Override default end_date

    Returns:
        pd.DataFrame: School measurements with connectivity performance data.
    """
    # Override defaults with kwargs if provided
    page_size = kwargs.get("page_size", self.page_size)
    sleep_time = kwargs.get("sleep_time", self.sleep_time)
    max_pages = kwargs.get("max_pages", None)
    giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)
    start_date = kwargs.get("start_date", self.start_date)
    end_date = kwargs.get("end_date", self.end_date)

    # Format dates if overridden
    if start_date != self.start_date:
        start_date = self._format_date(start_date)
    if end_date != self.end_date:
        end_date = self._format_date(end_date)

    # Prepare headers
    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Accept": "application/json",
    }

    all_data = []
    page = 1

    self.logger.info(
        f"Starting to fetch measurements for country: {self.country} "
        f"from {start_date} to {end_date}"
    )

    if giga_id_school:
        self.logger.info(f"Filtering for specific school ID: {giga_id_school}")

    while True:
        # Check if we've reached max_pages limit
        if max_pages and page > max_pages:
            self.logger.info(f"Reached maximum pages limit: {max_pages}")
            break

        # Build parameters
        params = {
            "country_iso3_code": self.country,
            "start_date": start_date,
            "end_date": end_date,
            "page": page,
            "size": page_size,
        }

        # Add giga_id_school filter if specified
        if giga_id_school:
            params["giga_id_school"] = giga_id_school

        try:
            self.logger.debug(f"Fetching page {page} with params: {params}")
            response = requests.get(self.api_url, headers=headers, params=params)
            response.raise_for_status()

            parsed = response.json()
            data = parsed.get("data", [])

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed on page {page}: {e}")
            break
        except ValueError as e:
            self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
            break

        # Check if we got any data
        if not data:
            self.logger.info(f"No data on page {page}. Stopping.")
            break

        all_data.extend(data)
        self.logger.info(f"Fetched page {page} with {len(data)} records")

        # If we got fewer records than page_size, we've reached the end
        if len(data) < page_size:
            self.logger.info("Reached end of data (partial page received)")
            break

        # If filtering by specific school ID, we might only need one page
        if giga_id_school and len(all_data) > 0:
            self.logger.info(
                "Specific school ID requested, checking if more data needed"
            )

        page += 1

        # Sleep to be respectful to the API
        if sleep_time > 0:
            time.sleep(sleep_time)

    self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

    # Convert to DataFrame and process
    if not all_data:
        self.logger.warning("No data fetched, returning empty DataFrame")
        return pd.DataFrame()

    df = pd.DataFrame(all_data)
    df = self._process_measurements_data(df)

    return df
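
Example usage, a minimal sketch: the fetcher class name and constructor fields below are assumptions for illustration (the class itself is documented earlier in this reference), while the keyword overrides match the signature above.

from gigaspatial.handlers.giga import GigaSchoolMeasurementsFetcher  # hypothetical import name

# Assumed constructor fields; see the class documentation above for the actual ones.
fetcher = GigaSchoolMeasurementsFetcher(
    country="KEN",
    start_date="2024-01-01",
    end_date="2024-06-30",
)

# Cap the number of pages and slow down requests while experimenting.
measurements = fetcher.fetch_measurements(page_size=500, max_pages=3, sleep_time=0.5)
print(f"Fetched {len(measurements)} measurement records")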
get_performance_summary(df)

Generate a comprehensive summary of connectivity performance metrics.

Parameters:

Name Type Description Default
df DataFrame

DataFrame with measurement data

required

Returns:

Name Type Description
dict dict

Summary statistics about connectivity performance

Source code in gigaspatial/handlers/giga.py
def get_performance_summary(self, df: pd.DataFrame) -> dict:
    """
    Generate a comprehensive summary of connectivity performance metrics.

    Args:
        df: DataFrame with measurement data

    Returns:
        dict: Summary statistics about connectivity performance
    """
    if df.empty:
        return {"error": "No data available"}

    summary = {
        "total_measurements": len(df),
        "country": (
            df["country_iso3_code"].iloc[0]
            if "country_iso3_code" in df.columns
            else "Unknown"
        ),
        "date_range": {
            "start": (
                df["date"].min().strftime("%Y-%m-%d")
                if "date" in df.columns
                else None
            ),
            "end": (
                df["date"].max().strftime("%Y-%m-%d")
                if "date" in df.columns
                else None
            ),
        },
    }

    # School coverage
    if "giga_id_school" in df.columns:
        unique_schools = df["giga_id_school"].nunique()
        summary["unique_schools_measured"] = unique_schools
        summary["avg_measurements_per_school"] = (
            len(df) / unique_schools if unique_schools > 0 else 0
        )

    # Speed statistics
    for speed_col in ["download_speed", "upload_speed"]:
        if speed_col in df.columns:
            speed_data = df[speed_col].dropna()
            if len(speed_data) > 0:
                summary[f"{speed_col}_stats"] = {
                    "mean": float(speed_data.mean()),
                    "median": float(speed_data.median()),
                    "min": float(speed_data.min()),
                    "max": float(speed_data.max()),
                    "std": float(speed_data.std()),
                }

    # Latency statistics
    if "latency" in df.columns:
        latency_data = df["latency"].dropna()
        if len(latency_data) > 0:
            summary["latency_stats"] = {
                "mean": float(latency_data.mean()),
                "median": float(latency_data.median()),
                "min": float(latency_data.min()),
                "max": float(latency_data.max()),
                "std": float(latency_data.std()),
            }

    # Performance categories
    for cat_col in [
        "download_speed_category",
        "upload_speed_category",
        "latency_category",
    ]:
        if cat_col in df.columns:
            cat_counts = df[cat_col].value_counts().to_dict()
            summary[cat_col.replace("_category", "_breakdown")] = cat_counts

    # Quality metrics
    if "has_broadband" in df.columns:
        summary["broadband_capable_measurements"] = int(df["has_broadband"].sum())
        summary["broadband_percentage"] = float(df["has_broadband"].mean() * 100)

    if "has_basic_connectivity" in df.columns:
        summary["basic_connectivity_measurements"] = int(
            df["has_basic_connectivity"].sum()
        )
        summary["basic_connectivity_percentage"] = float(
            df["has_basic_connectivity"].mean() * 100
        )

    # Data completeness
    if "has_complete_measurement" in df.columns:
        summary["complete_measurements"] = int(df["has_complete_measurement"].sum())
        summary["data_completeness_percentage"] = float(
            df["has_complete_measurement"].mean() * 100
        )

    # Data sources
    if "data_source" in df.columns:
        source_counts = df["data_source"].value_counts().to_dict()
        summary["data_sources"] = source_counts

    # Temporal patterns
    if "day_of_week" in df.columns:
        day_counts = df["day_of_week"].value_counts().to_dict()
        summary["measurements_by_day_of_week"] = day_counts

    self.logger.info("Generated performance summary")
    return summary
get_school_performance_comparison(df, top_n=10)

Compare performance across schools.

Parameters:

Name Type Description Default
df DataFrame

DataFrame with measurement data

required
top_n int

Number of top/bottom schools to include

10

Returns:

Name Type Description
dict dict

School performance comparison

Source code in gigaspatial/handlers/giga.py
def get_school_performance_comparison(
    self, df: pd.DataFrame, top_n: int = 10
) -> dict:
    """
    Compare performance across schools.

    Args:
        df: DataFrame with measurement data
        top_n: Number of top/bottom schools to include

    Returns:
        dict: School performance comparison
    """
    if df.empty or "giga_id_school" not in df.columns:
        return {"error": "No school data available"}

    # Build the aggregation spec from the columns that are actually present,
    # so a missing metric column does not raise a KeyError during .agg().
    metric_aggs = {
        "download_speed": ["mean", "median", "count"],
        "upload_speed": ["mean", "median"],
        "latency": ["mean", "median"],
        "has_broadband": ["mean"],
    }
    agg_spec = {
        col: funcs for col, funcs in metric_aggs.items() if col in df.columns
    }

    if not agg_spec:
        return {"error": "Insufficient data for school comparison"}

    school_stats = df.groupby("giga_id_school").agg(agg_spec).round(2)

    # Flatten column names
    school_stats.columns = ["_".join(col).strip() for col in school_stats.columns]

    # Sort by download speed
    if "download_speed_mean" in school_stats.columns:
        top_schools = school_stats.nlargest(top_n, "download_speed_mean")
        bottom_schools = school_stats.nsmallest(top_n, "download_speed_mean")

        return {
            "top_performing_schools": top_schools.to_dict("index"),
            "bottom_performing_schools": bottom_schools.to_dict("index"),
            "total_schools_analyzed": len(school_stats),
        }

    return {"error": "Insufficient data for school comparison"}

GigaSchoolProfileFetcher

Fetch and process school profile data from the Giga School Profile API. This includes connectivity information and other school details.

Source code in gigaspatial/handlers/giga.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class GigaSchoolProfileFetcher:
    """
    Fetch and process school profile data from the Giga School Profile API.
    This includes connectivity information and other school details.
    """

    country: str = Field(...)
    api_url: str = Field(
        default="https://uni-ooi-giga-maps-service.azurewebsites.net/api/v1/schools_profile/",
        description="Base URL for the Giga School Profile API",
    )
    api_key: str = global_config.GIGA_SCHOOL_PROFILE_API_KEY
    page_size: int = Field(default=1000, description="Number of records per API page")
    sleep_time: float = Field(
        default=0.2, description="Sleep time between API requests"
    )
    giga_id_school: Optional[str] = Field(
        default=None, description="Optional specific giga school ID to fetch"
    )

    logger: logging.Logger = Field(default=None, repr=False)

    def __post_init__(self):
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")

        if self.logger is None:
            self.logger = global_config.get_logger(self.__class__.__name__)

    def fetch_profiles(self, **kwargs) -> pd.DataFrame:
        """
        Fetch and process school profiles including connectivity information.

        Args:
            **kwargs: Additional parameters for customization
                - page_size: Override default page size
                - sleep_time: Override default sleep time between requests
                - max_pages: Limit the number of pages to fetch
                - giga_id_school: Override default giga_id_school filter

        Returns:
            pd.DataFrame: School profiles with connectivity and geospatial info.
        """
        # Override defaults with kwargs if provided
        page_size = kwargs.get("page_size", self.page_size)
        sleep_time = kwargs.get("sleep_time", self.sleep_time)
        max_pages = kwargs.get("max_pages", None)
        giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)

        # Prepare headers
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        }

        all_data = []
        page = 1

        self.logger.info(
            f"Starting to fetch school profiles for country: {self.country}"
        )

        if giga_id_school:
            self.logger.info(f"Filtering for specific school ID: {giga_id_school}")

        while True:
            # Check if we've reached max_pages limit
            if max_pages and page > max_pages:
                self.logger.info(f"Reached maximum pages limit: {max_pages}")
                break

            # Build parameters
            params = {
                "country_iso3_code": self.country,
                "page": page,
                "size": page_size,
            }

            # Add giga_id_school filter if specified
            if giga_id_school:
                params["giga_id_school"] = giga_id_school

            try:
                self.logger.debug(f"Fetching page {page} with params: {params}")
                response = requests.get(self.api_url, headers=headers, params=params)
                response.raise_for_status()

                parsed = response.json()
                data = parsed.get("data", [])

            except requests.exceptions.RequestException as e:
                self.logger.error(f"Request failed on page {page}: {e}")
                break
            except ValueError as e:
                self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
                break

            # Check if we got any data
            if not data:
                self.logger.info(f"No data on page {page}. Stopping.")
                break

            all_data.extend(data)
            self.logger.info(f"Fetched page {page} with {len(data)} records")

            # If we got fewer records than page_size, we've reached the end
            if len(data) < page_size:
                self.logger.info("Reached end of data (partial page received)")
                break

            # If filtering by specific school ID, we likely only need one page
            if giga_id_school:
                self.logger.info(
                    "Specific school ID requested, stopping after first page"
                )
                break

            page += 1

            # Sleep to be respectful to the API
            if sleep_time > 0:
                time.sleep(sleep_time)

        self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

        # Convert to DataFrame and process
        if not all_data:
            self.logger.warning("No data fetched, returning empty DataFrame")
            return pd.DataFrame()

        df = pd.DataFrame(all_data)

        return df

    def get_connectivity_summary(self, df: pd.DataFrame) -> dict:
        """
        Generate a summary of connectivity statistics from the fetched data.

        Args:
            df: DataFrame with school profile data

        Returns:
            dict: Summary statistics about connectivity
        """
        if df.empty:
            return {"error": "No data available"}

        summary = {
            "total_schools": len(df),
            "country": (
                df["country_iso3_code"].iloc[0]
                if "country_iso3_code" in df.columns
                else "Unknown"
            ),
        }

        # Administrative region analysis
        if "admin1" in df.columns:
            admin1_counts = df["admin1"].value_counts().head(10).to_dict()
            summary["top_admin1_regions"] = admin1_counts

        if "admin2" in df.columns:
            admin2_counts = df["admin2"].value_counts().head(10).to_dict()
            summary["top_admin2_regions"] = admin2_counts

        # Connectivity analysis
        if "connectivity" in df.columns:
            connected_count = df["connectivity"].sum()
            summary["schools_with_connectivity"] = int(connected_count)
            summary["connectivity_percentage"] = connected_count / len(df) * 100

        if "connectivity_RT" in df.columns:
            rt_connected_count = df["connectivity_RT"].sum()
            summary["schools_with_realtime_connectivity"] = int(rt_connected_count)
            summary["realtime_connectivity_percentage"] = (
                rt_connected_count / len(df) * 100
            )

        # Connectivity type analysis
        if "connectivity_type" in df.columns:

            if not all(df.connectivity_type.isna()):
                from collections import Counter

                type_counts = dict(Counter(df.connectivity_type.dropna().to_list()))
                summary["connectivity_types_breakdown"] = type_counts

        # Data source analysis
        if "connectivity_RT_datasource" in df.columns:
            datasource_counts = (
                df["connectivity_RT_datasource"].value_counts().to_dict()
            )
            summary["realtime_connectivity_datasources"] = datasource_counts

        if "school_data_source" in df.columns:
            school_datasource_counts = df["school_data_source"].value_counts().to_dict()
            summary["school_data_sources"] = school_datasource_counts

        self.logger.info("Generated connectivity summary")
        return summary
fetch_profiles(**kwargs)

Fetch and process school profiles including connectivity information.

Parameters:

Name Type Description Default
**kwargs

Additional parameters for customization:
- page_size: Override default page size
- sleep_time: Override default sleep time between requests
- max_pages: Limit the number of pages to fetch
- giga_id_school: Override default giga_id_school filter

{}

Returns:

Type Description
DataFrame

pd.DataFrame: School profiles with connectivity and geospatial info.

Source code in gigaspatial/handlers/giga.py
def fetch_profiles(self, **kwargs) -> pd.DataFrame:
    """
    Fetch and process school profiles including connectivity information.

    Args:
        **kwargs: Additional parameters for customization
            - page_size: Override default page size
            - sleep_time: Override default sleep time between requests
            - max_pages: Limit the number of pages to fetch
            - giga_id_school: Override default giga_id_school filter

    Returns:
        pd.DataFrame: School profiles with connectivity and geospatial info.
    """
    # Override defaults with kwargs if provided
    page_size = kwargs.get("page_size", self.page_size)
    sleep_time = kwargs.get("sleep_time", self.sleep_time)
    max_pages = kwargs.get("max_pages", None)
    giga_id_school = kwargs.get("giga_id_school", self.giga_id_school)

    # Prepare headers
    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "Accept": "application/json",
    }

    all_data = []
    page = 1

    self.logger.info(
        f"Starting to fetch school profiles for country: {self.country}"
    )

    if giga_id_school:
        self.logger.info(f"Filtering for specific school ID: {giga_id_school}")

    while True:
        # Check if we've reached max_pages limit
        if max_pages and page > max_pages:
            self.logger.info(f"Reached maximum pages limit: {max_pages}")
            break

        # Build parameters
        params = {
            "country_iso3_code": self.country,
            "page": page,
            "size": page_size,
        }

        # Add giga_id_school filter if specified
        if giga_id_school:
            params["giga_id_school"] = giga_id_school

        try:
            self.logger.debug(f"Fetching page {page} with params: {params}")
            response = requests.get(self.api_url, headers=headers, params=params)
            response.raise_for_status()

            parsed = response.json()
            data = parsed.get("data", [])

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Request failed on page {page}: {e}")
            break
        except ValueError as e:
            self.logger.error(f"Failed to parse JSON response on page {page}: {e}")
            break

        # Check if we got any data
        if not data:
            self.logger.info(f"No data on page {page}. Stopping.")
            break

        all_data.extend(data)
        self.logger.info(f"Fetched page {page} with {len(data)} records")

        # If we got fewer records than page_size, we've reached the end
        if len(data) < page_size:
            self.logger.info("Reached end of data (partial page received)")
            break

        # If filtering by specific school ID, we likely only need one page
        if giga_id_school:
            self.logger.info(
                "Specific school ID requested, stopping after first page"
            )
            break

        page += 1

        # Sleep to be respectful to the API
        if sleep_time > 0:
            time.sleep(sleep_time)

    self.logger.info(f"Finished fetching. Total records: {len(all_data)}")

    # Convert to DataFrame and process
    if not all_data:
        self.logger.warning("No data fetched, returning empty DataFrame")
        return pd.DataFrame()

    df = pd.DataFrame(all_data)

    return df
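
Example usage, a minimal sketch: it assumes the GIGA_SCHOOL_PROFILE_API_KEY setting is already configured; the country name is resolved to its ISO3 code at construction time.

from gigaspatial.handlers.giga import GigaSchoolProfileFetcher

fetcher = GigaSchoolProfileFetcher(country="Brazil")  # stored internally as "BRA"

# Limit request volume while testing; drop max_pages for a full pull.
profiles = fetcher.fetch_profiles(max_pages=2, page_size=500)
print(profiles.columns.tolist())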
get_connectivity_summary(df)

Generate a summary of connectivity statistics from the fetched data.

Parameters:

Name Type Description Default
df DataFrame

DataFrame with school profile data

required

Returns:

Name Type Description
dict dict

Summary statistics about connectivity

Source code in gigaspatial/handlers/giga.py
def get_connectivity_summary(self, df: pd.DataFrame) -> dict:
    """
    Generate a summary of connectivity statistics from the fetched data.

    Args:
        df: DataFrame with school profile data

    Returns:
        dict: Summary statistics about connectivity
    """
    if df.empty:
        return {"error": "No data available"}

    summary = {
        "total_schools": len(df),
        "country": (
            df["country_iso3_code"].iloc[0]
            if "country_iso3_code" in df.columns
            else "Unknown"
        ),
    }

    # Administrative region analysis
    if "admin1" in df.columns:
        admin1_counts = df["admin1"].value_counts().head(10).to_dict()
        summary["top_admin1_regions"] = admin1_counts

    if "admin2" in df.columns:
        admin2_counts = df["admin2"].value_counts().head(10).to_dict()
        summary["top_admin2_regions"] = admin2_counts

    # Connectivity analysis
    if "connectivity" in df.columns:
        connected_count = df["connectivity"].sum()
        summary["schools_with_connectivity"] = int(connected_count)
        summary["connectivity_percentage"] = connected_count / len(df) * 100

    if "connectivity_RT" in df.columns:
        rt_connected_count = df["connectivity_RT"].sum()
        summary["schools_with_realtime_connectivity"] = int(rt_connected_count)
        summary["realtime_connectivity_percentage"] = (
            rt_connected_count / len(df) * 100
        )

    # Connectivity type analysis
    if "connectivity_type" in df.columns:

        if not all(df.connectivity_type.isna()):
            from collections import Counter

            type_counts = dict(Counter(df.connectivity_type.dropna().to_list()))
            summary["connectivity_types_breakdown"] = type_counts

    # Data source analysis
    if "connectivity_RT_datasource" in df.columns:
        datasource_counts = (
            df["connectivity_RT_datasource"].value_counts().to_dict()
        )
        summary["realtime_connectivity_datasources"] = datasource_counts

    if "school_data_source" in df.columns:
        school_datasource_counts = df["school_data_source"].value_counts().to_dict()
        summary["school_data_sources"] = school_datasource_counts

    self.logger.info("Generated connectivity summary")
    return summary
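
Continuing the sketch above, the summary is then generated from the fetched profiles:

summary = fetcher.get_connectivity_summary(profiles)
print(summary.get("connectivity_percentage"))
print(summary.get("connectivity_types_breakdown"))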

google_open_buildings

GoogleOpenBuildingsConfig dataclass

Bases: BaseHandlerConfig

Configuration for Google Open Buildings dataset files. Implements the BaseHandlerConfig interface for data unit resolution.

Source code in gigaspatial/handlers/google_open_buildings.py
@dataclass
class GoogleOpenBuildingsConfig(BaseHandlerConfig):
    """
    Configuration for Google Open Buildings dataset files.
    Implements the BaseHandlerConfig interface for data unit resolution.
    """

    TILES_URL: str = (
        "https://openbuildings-public-dot-gweb-research.uw.r.appspot.com/public/tiles.geojson"
    )
    base_path: Path = global_config.get_path("google_open_buildings", "bronze")
    data_types: tuple = ("polygons", "points")

    def __post_init__(self):
        super().__post_init__()
        self._load_s2_tiles()

    def _load_s2_tiles(self):
        """Load S2 tiles from GeoJSON file."""
        response = requests.get(self.TILES_URL)
        response.raise_for_status()
        self.tiles_gdf = gpd.GeoDataFrame.from_features(
            response.json()["features"], crs="EPSG:4326"
        )

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[dict]:
        """
        Return intersecting tiles for a given geometry or GeoDataFrame.
        """
        return self._get_relevant_tiles(geometry)

    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> List[dict]:
        """
        Return intersecting tiles for a list of points.
        """
        return self._get_relevant_tiles(points)

    def get_data_unit_path(
        self,
        unit: Union[pd.Series, dict, str],
        data_type: str = "polygons",
        **kwargs,
    ) -> Path:
        """
        Given a tile row or tile_id, return the corresponding file path.
        """
        tile_id = (
            unit["tile_id"]
            if isinstance(unit, pd.Series) or isinstance(unit, dict)
            else unit
        )
        return self.base_path / f"{data_type}_s2_level_4_{tile_id}_buildings.csv.gz"

    def get_data_unit_paths(
        self,
        units: Union[pd.DataFrame, Iterable[Union[dict, str]]],
        data_type: str = "polygons",
        **kwargs,
    ) -> list:
        """
        Given data unit identifiers, return the corresponding file paths.
        """
        if isinstance(units, pd.DataFrame):
            return [
                self.get_data_unit_path(row, data_type=data_type, **kwargs)
                for _, row in units.iterrows()
            ]
        return super().get_data_unit_paths(units, data_type=data_type)

    def _get_relevant_tiles(
        self,
        source: Union[
            BaseGeometry,
            gpd.GeoDataFrame,
            Iterable[Union[Point, tuple]],
        ],
    ) -> List[dict]:
        """
        Identify and return the S2 tiles that spatially intersect with the given geometry.
        """
        if isinstance(source, gpd.GeoDataFrame):
            if source.crs != "EPSG:4326":
                source = source.to_crs("EPSG:4326")
            search_geom = source.geometry.unary_union
        elif isinstance(source, BaseGeometry):
            search_geom = source
        elif isinstance(source, Iterable) and all(
            # Check Point instances first: shapely Point objects do not support len().
            isinstance(pt, Point) or len(pt) == 2
            for pt in source
        ):
            points = [
                pt if isinstance(pt, Point) else Point(pt[1], pt[0]) for pt in source
            ]
            search_geom = MultiPoint(points)
        else:
            raise ValueError(
                f"Expected Geometry, GeoDataFrame or iterable of Points, got {source.__class__}"
            )
        mask = (
            tile_geom.intersects(search_geom) for tile_geom in self.tiles_gdf.geometry
        )
        return self.tiles_gdf.loc[mask, ["tile_id", "tile_url", "size_mb"]].to_dict(
            "records"
        )
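
Example usage, a minimal sketch of tile resolution: instantiating GoogleOpenBuildingsConfig fetches the public S2 tile index over the network, and the bounding box below is an arbitrary example area.

from shapely.geometry import box
from gigaspatial.handlers.google_open_buildings import GoogleOpenBuildingsConfig

config = GoogleOpenBuildingsConfig()  # downloads tiles.geojson on init

# Tiles intersecting an example bounding box (lon/lat order, EPSG:4326).
tiles = config.get_relevant_data_units_by_geometry(box(36.7, -1.4, 37.0, -1.1))
for tile in tiles:
    print(tile["tile_id"], tile["size_mb"], config.get_data_unit_path(tile))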
get_data_unit_path(unit, data_type='polygons', **kwargs)

Given a tile row or tile_id, return the corresponding file path.

Source code in gigaspatial/handlers/google_open_buildings.py
def get_data_unit_path(
    self,
    unit: Union[pd.Series, dict, str],
    data_type: str = "polygons",
    **kwargs,
) -> Path:
    """
    Given a tile row or tile_id, return the corresponding file path.
    """
    tile_id = (
        unit["tile_id"]
        if isinstance(unit, pd.Series) or isinstance(unit, dict)
        else unit
    )
    return self.base_path / f"{data_type}_s2_level_4_{tile_id}_buildings.csv.gz"
get_data_unit_paths(units, data_type='polygons', **kwargs)

Given data unit identifiers, return the corresponding file paths.

Source code in gigaspatial/handlers/google_open_buildings.py
def get_data_unit_paths(
    self,
    units: Union[pd.DataFrame, Iterable[Union[dict, str]]],
    data_type: str = "polygons",
    **kwargs,
) -> list:
    """
    Given data unit identifiers, return the corresponding file paths.
    """
    if isinstance(units, pd.DataFrame):
        return [
            self.get_data_unit_path(row, data_type=data_type, **kwargs)
            for _, row in units.iterrows()
        ]
    return super().get_data_unit_paths(units, data_type=data_type)
get_relevant_data_units_by_geometry(geometry, **kwargs)

Return intersecting tiles for a given geometry or GeoDataFrame.

Source code in gigaspatial/handlers/google_open_buildings.py
def get_relevant_data_units_by_geometry(
    self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
) -> List[dict]:
    """
    Return intersecting tiles for a given geometry or GeoDataFrame.
    """
    return self._get_relevant_tiles(geometry)
get_relevant_data_units_by_points(points, **kwargs)

Return intersecting tiles for a list of points.

Source code in gigaspatial/handlers/google_open_buildings.py
def get_relevant_data_units_by_points(
    self, points: Iterable[Union[Point, tuple]], **kwargs
) -> List[dict]:
    """
    Return intersecting tiles for a list of points.
    """
    return self._get_relevant_tiles(points)
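
When passing plain tuples, note that they are interpreted as (latitude, longitude) pairs and converted to Point(lon, lat) internally. A short sketch, reusing the config from the previous example:

from shapely.geometry import Point

# Tuples are (lat, lon); shapely Points use (lon, lat) as usual.
tiles = config.get_relevant_data_units_by_points([(-1.29, 36.82), Point(36.82, -1.29)])
print([t["tile_id"] for t in tiles])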

GoogleOpenBuildingsDownloader

Bases: BaseHandlerDownloader

A class to handle downloads of Google's Open Buildings dataset.

Source code in gigaspatial/handlers/google_open_buildings.py
class GoogleOpenBuildingsDownloader(BaseHandlerDownloader):
    """A class to handle downloads of Google's Open Buildings dataset."""

    def __init__(
        self,
        config: Optional[GoogleOpenBuildingsConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Optional configuration for file paths and download settings.
                    If None, a default `GoogleOpenBuildingsConfig` is used.
            data_store: Optional instance of a `DataStore` for managing data
                        storage. If None, a `LocalDataStore` is used.
            logger: Optional custom logger instance. If None, a default logger
                    named after the module is created and used.
        """
        config = config or GoogleOpenBuildingsConfig()
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self,
        tile_info: Union[pd.Series, dict],
        data_type: Literal["polygons", "points"] = "polygons",
    ) -> Optional[str]:
        """Download data file for a single tile."""

        tile_url = tile_info["tile_url"]
        if data_type == "points":
            tile_url = tile_url.replace("polygons", "points")

        try:
            response = requests.get(tile_url, stream=True)
            response.raise_for_status()

            file_path = str(
                self.config.get_data_unit_path(
                    tile_info["tile_id"], data_type=data_type
                )
            )

            with self.data_store.open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

                self.logger.debug(
                    f"Successfully downloaded tile: {tile_info['tile_id']}"
                )
                return file_path

        except requests.exceptions.RequestException as e:
            self.logger.error(
                f"Failed to download tile {tile_info['tile_id']}: {str(e)}"
            )
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
            return None

    def download_data_units(
        self,
        tiles: Union[pd.DataFrame, List[dict]],
        data_type: Literal["polygons", "points"] = "polygons",
    ) -> List[str]:
        """Download data files for multiple tiles."""

        if len(tiles) == 0:
            self.logger.warning(f"There is no matching data")
            return []

        with multiprocessing.Pool(self.config.n_workers) as pool:
            download_func = functools.partial(
                self.download_data_unit, data_type=data_type
            )
            file_paths = list(
                tqdm(
                    pool.imap(
                        download_func,
                        (
                            [row for _, row in tiles.iterrows()]
                            if isinstance(tiles, pd.DataFrame)
                            else tiles
                        ),
                    ),
                    total=len(tiles),
                    desc=f"Downloading {data_type} data",
                )
            )

        return [path for path in file_paths if path is not None]

    def download(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # shapely geoms
            gpd.GeoDataFrame,
        ],
        data_type: Literal["polygons", "points"] = "polygons",
        **kwargs,
    ) -> List[str]:
        """Download Google Open Buildings data for a specified geographic region.

        The region can be defined by a country code/name, a list of points,
        a Shapely geometry, or a GeoDataFrame. This method identifies the
        relevant S2 tiles intersecting the region and downloads the
        specified type of data (polygons or points) for those tiles in parallel.

        Args:
            source: Defines the geographic area for which to download data.
                    Can be:
                      - A string representing a country code or name.
                      - A list of (latitude, longitude) tuples or Shapely Point objects.
                      - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                      - A GeoDataFrame with geometry column in EPSG:4326.
            data_type: The type of building data to download ('polygons' or 'points').
                       Defaults to 'polygons'.
            **kwargs: Additional keyword arguments that are passed to
                      `AdminBoundaries.create()` if `source` is a country code.
                      For example, `path` to a custom boundaries file.

        Returns:
            A list of local file paths for the successfully downloaded tiles.
            Returns an empty list if no data is found for the region or if
            all downloads fail.
        """

        tiles = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(tiles, data_type)

    def download_by_country(
        self,
        country: str,
        data_type: Literal["polygons", "points"] = "polygons",
        data_store: Optional[DataStore] = None,
        country_geom_path: Optional[Union[str, Path]] = None,
    ) -> List[str]:
        """
        Download Google Open Buildings data for a specific country.

        This is a convenience method to download data for an entire country
        using its code or name.

        Args:
            country: The country code (e.g., 'USA', 'GBR') or name.
            data_type: The type of building data to download ('polygons' or 'points').
                       Defaults to 'polygons'.
            data_store: Optional instance of a `DataStore` to be used by
                        `AdminBoundaries` for loading country boundaries. If None,
                        `AdminBoundaries` will use its default data loading.
            country_geom_path: Optional path to a GeoJSON file containing the
                               country boundary. If provided, this boundary is used
                               instead of the default from `AdminBoundaries`.

        Returns:
            A list of local file paths for the successfully downloaded tiles
            for the specified country.
        """
        return self.download(
            source=country,
            data_type=data_type,
            data_store=data_store,
            path=country_geom_path,
        )
__init__(config=None, data_store=None, logger=None)

Initialize the downloader.

Parameters:

Name Type Description Default
config Optional[GoogleOpenBuildingsConfig]

Optional configuration for file paths and download settings. If None, a default GoogleOpenBuildingsConfig is used.

None
data_store Optional[DataStore]

Optional instance of a DataStore for managing data storage. If None, a LocalDataStore is used.

None
logger Optional[Logger]

Optional custom logger instance. If None, a default logger named after the module is created and used.

None
Source code in gigaspatial/handlers/google_open_buildings.py
def __init__(
    self,
    config: Optional[GoogleOpenBuildingsConfig] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: Optional configuration for file paths and download settings.
                If None, a default `GoogleOpenBuildingsConfig` is used.
        data_store: Optional instance of a `DataStore` for managing data
                    storage. If None, a `LocalDataStore` is used.
        logger: Optional custom logger instance. If None, a default logger
                named after the module is created and used.
    """
    config = config or GoogleOpenBuildingsConfig()
    super().__init__(config=config, data_store=data_store, logger=logger)
download(source, data_type='polygons', **kwargs)

Download Google Open Buildings data for a specified geographic region.

The region can be defined by a country code/name, a list of points, a Shapely geometry, or a GeoDataFrame. This method identifies the relevant S2 tiles intersecting the region and downloads the specified type of data (polygons or points) for those tiles in parallel.

Parameters:

Name Type Description Default
source Union[str, List[Union[Tuple[float, float], Point]], BaseGeometry, GeoDataFrame]

Defines the geographic area for which to download data. Can be:
- A string representing a country code or name.
- A list of (latitude, longitude) tuples or Shapely Point objects.
- A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
- A GeoDataFrame with geometry column in EPSG:4326.

required
data_type Literal['polygons', 'points']

The type of building data to download ('polygons' or 'points'). Defaults to 'polygons'.

'polygons'
**kwargs

Additional keyword arguments that are passed to AdminBoundaries.create() if source is a country code. For example, path to a custom boundaries file.

{}

Returns:

Type Description
List[str]

A list of local file paths for the successfully downloaded tiles. Returns an empty list if no data is found for the region or if all downloads fail.

Source code in gigaspatial/handlers/google_open_buildings.py
def download(
    self,
    source: Union[
        str,  # country
        List[Union[Tuple[float, float], Point]],  # points
        BaseGeometry,  # shapely geoms
        gpd.GeoDataFrame,
    ],
    data_type: Literal["polygons", "points"] = "polygons",
    **kwargs,
) -> List[str]:
    """Download Google Open Buildings data for a specified geographic region.

    The region can be defined by a country code/name, a list of points,
    a Shapely geometry, or a GeoDataFrame. This method identifies the
    relevant S2 tiles intersecting the region and downloads the
    specified type of data (polygons or points) for those tiles in parallel.

    Args:
        source: Defines the geographic area for which to download data.
                Can be:
                  - A string representing a country code or name.
                  - A list of (latitude, longitude) tuples or Shapely Point objects.
                  - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                  - A GeoDataFrame with geometry column in EPSG:4326.
        data_type: The type of building data to download ('polygons' or 'points').
                   Defaults to 'polygons'.
        **kwargs: Additional keyword arguments that are passed to
                  `AdminBoundaries.create()` if `source` is a country code.
                  For example, `path` to a custom boundaries file.

    Returns:
        A list of local file paths for the successfully downloaded tiles.
        Returns an empty list if no data is found for the region or if
        all downloads fail.
    """

    tiles = self.config.get_relevant_data_units(source, **kwargs)
    return self.download_data_units(tiles, data_type)
download_by_country(country, data_type='polygons', data_store=None, country_geom_path=None)

Download Google Open Buildings data for a specific country.

This is a convenience method to download data for an entire country using its code or name.

Parameters:

Name Type Description Default
country str

The country code (e.g., 'USA', 'GBR') or name.

required
data_type Literal['polygons', 'points']

The type of building data to download ('polygons' or 'points'). Defaults to 'polygons'.

'polygons'
data_store Optional[DataStore]

Optional instance of a DataStore to be used by AdminBoundaries for loading country boundaries. If None, AdminBoundaries will use its default data loading.

None
country_geom_path Optional[Union[str, Path]]

Optional path to a GeoJSON file containing the country boundary. If provided, this boundary is used instead of the default from AdminBoundaries.

None

Returns:

Type Description
List[str]

A list of local file paths for the successfully downloaded tiles for the specified country.

Source code in gigaspatial/handlers/google_open_buildings.py
def download_by_country(
    self,
    country: str,
    data_type: Literal["polygons", "points"] = "polygons",
    data_store: Optional[DataStore] = None,
    country_geom_path: Optional[Union[str, Path]] = None,
) -> List[str]:
    """
    Download Google Open Buildings data for a specific country.

    This is a convenience method to download data for an entire country
    using its code or name.

    Args:
        country: The country code (e.g., 'USA', 'GBR') or name.
        data_type: The type of building data to download ('polygons' or 'points').
                   Defaults to 'polygons'.
        data_store: Optional instance of a `DataStore` to be used by
                    `AdminBoundaries` for loading country boundaries. If None,
                    `AdminBoundaries` will use its default data loading.
        country_geom_path: Optional path to a GeoJSON file containing the
                           country boundary. If provided, this boundary is used
                           instead of the default from `AdminBoundaries`.

    Returns:
        A list of local file paths for the successfully downloaded tiles
        for the specified country.
    """
    return self.download(
        source=country,
        data_type=data_type,
        data_store=data_store,
        path=country_geom_path,
    )
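
Example usage, a minimal sketch: the country-level call downloads every intersecting tile, which can be large, while the geometry-based call restricts the area; resulting paths depend on the configured base_path and data store.

from shapely.geometry import box
from gigaspatial.handlers.google_open_buildings import GoogleOpenBuildingsDownloader

downloader = GoogleOpenBuildingsDownloader()

# Building points for a single country (convenience wrapper around download()).
point_files = downloader.download_by_country("RWA", data_type="points")

# Building polygons for an arbitrary area of interest.
polygon_files = downloader.download(box(29.3, -2.0, 30.0, -1.3), data_type="polygons")
print(len(point_files), len(polygon_files))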
download_data_unit(tile_info, data_type='polygons')

Download data file for a single tile.

Source code in gigaspatial/handlers/google_open_buildings.py
def download_data_unit(
    self,
    tile_info: Union[pd.Series, dict],
    data_type: Literal["polygons", "points"] = "polygons",
) -> Optional[str]:
    """Download data file for a single tile."""

    tile_url = tile_info["tile_url"]
    if data_type == "points":
        tile_url = tile_url.replace("polygons", "points")

    try:
        response = requests.get(tile_url, stream=True)
        response.raise_for_status()

        file_path = str(
            self.config.get_data_unit_path(
                tile_info["tile_id"], data_type=data_type
            )
        )

        with self.data_store.open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

            self.logger.debug(
                f"Successfully downloaded tile: {tile_info['tile_id']}"
            )
            return file_path

    except requests.exceptions.RequestException as e:
        self.logger.error(
            f"Failed to download tile {tile_info['tile_id']}: {str(e)}"
        )
        return None
    except Exception as e:
        self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
        return None
download_data_units(tiles, data_type='polygons')

Download data files for multiple tiles.

Source code in gigaspatial/handlers/google_open_buildings.py
def download_data_units(
    self,
    tiles: Union[pd.DataFrame, List[dict]],
    data_type: Literal["polygons", "points"] = "polygons",
) -> List[str]:
    """Download data files for multiple tiles."""

    if len(tiles) == 0:
        self.logger.warning(f"There is no matching data")
        return []

    with multiprocessing.Pool(self.config.n_workers) as pool:
        download_func = functools.partial(
            self.download_data_unit, data_type=data_type
        )
        file_paths = list(
            tqdm(
                pool.imap(
                    download_func,
                    (
                        [row for _, row in tiles.iterrows()]
                        if isinstance(tiles, pd.DataFrame)
                        else tiles
                    ),
                ),
                total=len(tiles),
                desc=f"Downloading {data_type} data",
            )
        )

    return [path for path in file_paths if path is not None]

GoogleOpenBuildingsHandler

Bases: BaseHandler

Handler for Google Open Buildings dataset.

This class provides a unified interface for downloading and loading Google Open Buildings data. It manages the lifecycle of configuration, downloading, and reading components.

Source code in gigaspatial/handlers/google_open_buildings.py
class GoogleOpenBuildingsHandler(BaseHandler):
    """
    Handler for Google Open Buildings dataset.

    This class provides a unified interface for downloading and loading Google Open Buildings data.
    It manages the lifecycle of configuration, downloading, and reading components.
    """

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> GoogleOpenBuildingsConfig:
        """
        Create and return a GoogleOpenBuildingsConfig instance.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured GoogleOpenBuildingsConfig instance
        """
        return GoogleOpenBuildingsConfig(data_store=data_store, logger=logger, **kwargs)

    def create_downloader(
        self,
        config: GoogleOpenBuildingsConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GoogleOpenBuildingsDownloader:
        """
        Create and return a GoogleOpenBuildingsDownloader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured GoogleOpenBuildingsDownloader instance
        """
        return GoogleOpenBuildingsDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self,
        config: GoogleOpenBuildingsConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> GoogleOpenBuildingsReader:
        """
        Create and return a GoogleOpenBuildingsReader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured GoogleOpenBuildingsReader instance
        """
        return GoogleOpenBuildingsReader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def load_points(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ) -> gpd.GeoDataFrame:
        """
        Load point data from Google Open Buildings dataset.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            GeoDataFrame containing building point data
        """
        return self.load_data(
            source=source,
            ensure_available=ensure_available,
            data_type="points",
            **kwargs,
        )

    def load_polygons(
        self,
        source: Union[
            str,  # country
            List[Union[tuple, Point]],  # points
            BaseGeometry,  # geometry
            gpd.GeoDataFrame,  # geodataframe
            Path,  # path
            List[Union[str, Path]],  # list of paths
        ],
        ensure_available: bool = True,
        **kwargs,
    ) -> gpd.GeoDataFrame:
        """
        Load polygon data from Google Open Buildings dataset.

        Args:
            source: The data source specification
            ensure_available: If True, ensure data is downloaded before loading
            **kwargs: Additional parameters passed to load methods

        Returns:
            GeoDataFrame containing building polygon data
        """
        return self.load_data(
            source=source,
            ensure_available=ensure_available,
            data_type="polygons",
            **kwargs,
        )
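
Example usage, a minimal end-to-end sketch: the handler wires the config, downloader, and reader together; load_polygons and load_points delegate to load_data, which ensures the underlying tiles are downloaded first when ensure_available is True.

from gigaspatial.handlers.google_open_buildings import GoogleOpenBuildingsHandler

handler = GoogleOpenBuildingsHandler()  # defaults to a LocalDataStore

# Download (if needed) and load building polygons for a country.
buildings = handler.load_polygons("RWA", ensure_available=True)
print(buildings.shape)

# Points variant for the same source.
centroids = handler.load_points("RWA")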
create_config(data_store, logger, **kwargs)

Create and return a GoogleOpenBuildingsConfig instance.

Parameters:

Name Type Description Default
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional configuration parameters

{}

Returns:

Type Description
GoogleOpenBuildingsConfig

Configured GoogleOpenBuildingsConfig instance

Source code in gigaspatial/handlers/google_open_buildings.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> GoogleOpenBuildingsConfig:
    """
    Create and return a GoogleOpenBuildingsConfig instance.

    Args:
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional configuration parameters

    Returns:
        Configured GoogleOpenBuildingsConfig instance
    """
    return GoogleOpenBuildingsConfig(data_store=data_store, logger=logger, **kwargs)
create_downloader(config, data_store, logger, **kwargs)

Create and return a GoogleOpenBuildingsDownloader instance.

Parameters:

Name Type Description Default
config GoogleOpenBuildingsConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional downloader parameters

{}

Returns:

Type Description
GoogleOpenBuildingsDownloader

Configured GoogleOpenBuildingsDownloader instance

Source code in gigaspatial/handlers/google_open_buildings.py
def create_downloader(
    self,
    config: GoogleOpenBuildingsConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> GoogleOpenBuildingsDownloader:
    """
    Create and return a GoogleOpenBuildingsDownloader instance.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional downloader parameters

    Returns:
        Configured GoogleOpenBuildingsDownloader instance
    """
    return GoogleOpenBuildingsDownloader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a GoogleOpenBuildingsReader instance.

Parameters:

Name Type Description Default
config GoogleOpenBuildingsConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional reader parameters

{}

Returns:

Type Description
GoogleOpenBuildingsReader

Configured GoogleOpenBuildingsReader instance

Source code in gigaspatial/handlers/google_open_buildings.py
def create_reader(
    self,
    config: GoogleOpenBuildingsConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> GoogleOpenBuildingsReader:
    """
    Create and return a GoogleOpenBuildingsReader instance.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional reader parameters

    Returns:
        Configured GoogleOpenBuildingsReader instance
    """
    return GoogleOpenBuildingsReader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
load_points(source, ensure_available=True, **kwargs)

Load point data from Google Open Buildings dataset.

Parameters:

Name Type Description Default
source Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]]

The data source specification

required
ensure_available bool

If True, ensure data is downloaded before loading

True
**kwargs

Additional parameters passed to load methods

{}

Returns:

Type Description
GeoDataFrame

GeoDataFrame containing building point data

Source code in gigaspatial/handlers/google_open_buildings.py
def load_points(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    ensure_available: bool = True,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Load point data from Google Open Buildings dataset.

    Args:
        source: The data source specification
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        GeoDataFrame containing building point data
    """
    return self.load_data(
        source=source,
        ensure_available=ensure_available,
        data_type="points",
        **kwargs,
    )
load_polygons(source, ensure_available=True, **kwargs)

Load polygon data from Google Open Buildings dataset.

Parameters:

    source (Union[str, List[Union[tuple, Point]], BaseGeometry, GeoDataFrame, Path, List[Union[str, Path]]]): The data source specification. required
    ensure_available (bool): If True, ensure data is downloaded before loading. Default: True
    **kwargs: Additional parameters passed to load methods. Default: {}

Returns:

    GeoDataFrame: GeoDataFrame containing building polygon data

Source code in gigaspatial/handlers/google_open_buildings.py
def load_polygons(
    self,
    source: Union[
        str,  # country
        List[Union[tuple, Point]],  # points
        BaseGeometry,  # geometry
        gpd.GeoDataFrame,  # geodataframe
        Path,  # path
        List[Union[str, Path]],  # list of paths
    ],
    ensure_available: bool = True,
    **kwargs,
) -> gpd.GeoDataFrame:
    """
    Load polygon data from Google Open Buildings dataset.

    Args:
        source: The data source specification
        ensure_available: If True, ensure data is downloaded before loading
        **kwargs: Additional parameters passed to load methods

    Returns:
        GeoDataFrame containing building polygon data
    """
    return self.load_data(
        source=source,
        ensure_available=ensure_available,
        data_type="polygons",
        **kwargs,
    )
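For orientation, here is a minimal usage sketch of these convenience methods. It assumes the handler class documented above is importable as GoogleOpenBuildingsHandler and that a country code such as "SEN" is an accepted string source; both are assumptions for illustration, not guarantees from this page.

    # Hypothetical usage sketch; class name and "SEN" source are illustrative.
    from gigaspatial.handlers.google_open_buildings import GoogleOpenBuildingsHandler

    handler = GoogleOpenBuildingsHandler()  # wires config, downloader and reader

    # Download (if needed) and load building centroids for the country
    points_gdf = handler.load_points("SEN", ensure_available=True)

    # Load building footprints for the same country
    polygons_gdf = handler.load_polygons("SEN")
    print(len(points_gdf), len(polygons_gdf))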

GoogleOpenBuildingsReader

Bases: BaseHandlerReader

Reader for Google Open Buildings data, supporting country, points, and geometry-based resolution.

Source code in gigaspatial/handlers/google_open_buildings.py
class GoogleOpenBuildingsReader(BaseHandlerReader):
    """
    Reader for Google Open Buildings data, supporting country, points, and geometry-based resolution.
    """

    def __init__(
        self,
        config: Optional[GoogleOpenBuildingsConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        config = config or GoogleOpenBuildingsConfig()
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> gpd.GeoDataFrame:
        """
        Load building data from Google Open Buildings dataset.
        Args:
            source_data_path: List of file paths to load
        Returns:
            GeoDataFrame containing building data
        """
        result = self._load_tabular_data(file_paths=source_data_path)
        return result

    def load(self, source, data_type="polygons", **kwargs):
        return super().load(source=source, data_type=data_type, **kwargs)

    def load_points(self, source, **kwargs):
        """This is a convenience method to load points data"""
        return self.load(source=source, data_type="points", **kwargs)

    def load_polygons(self, source, **kwargs):
        """This is a convenience method to load polygons data"""
        return self.load(source=source, data_type="polygons", **kwargs)
load_from_paths(source_data_path, **kwargs)

Load building data from Google Open Buildings dataset.

Parameters:

    source_data_path (List[Union[str, Path]]): List of file paths to load

Returns:

    GeoDataFrame containing building data

Source code in gigaspatial/handlers/google_open_buildings.py
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> gpd.GeoDataFrame:
    """
    Load building data from Google Open Buildings dataset.
    Args:
        source_data_path: List of file paths to load
    Returns:
        GeoDataFrame containing building data
    """
    result = self._load_tabular_data(file_paths=source_data_path)
    return result
load_points(source, **kwargs)

This is a convenience method to load points data

Source code in gigaspatial/handlers/google_open_buildings.py
def load_points(self, source, **kwargs):
    """This is a convenience method to load points data"""
    return self.load(source=source, data_type="points", **kwargs)
load_polygons(source, **kwargs)

This is a convenience method to load polygons data

Source code in gigaspatial/handlers/google_open_buildings.py
def load_polygons(self, source, **kwargs):
    """This is a convenience method to load polygons data"""
    return self.load(source=source, data_type="polygons", **kwargs)
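The reader can also be used on its own, independent of the handler. A short sketch follows, assuming the relevant tiles have already been downloaded to the configured data store and that a country code is a valid source (illustrative assumptions):

    from gigaspatial.handlers.google_open_buildings import GoogleOpenBuildingsReader

    reader = GoogleOpenBuildingsReader()  # uses GoogleOpenBuildingsConfig defaults

    # load() defaults to data_type="polygons"; the convenience wrappers set it
    polygons = reader.load("SEN")
    points = reader.load_points("SEN")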

hdx

HDXConfig dataclass

Bases: BaseHandlerConfig

Configuration for HDX data access

Source code in gigaspatial/handlers/hdx.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class HDXConfig(BaseHandlerConfig):
    """Configuration for HDX data access"""

    # User configuration
    dataset_name: str = Field(
        default=..., description="Name of the HDX dataset to download"
    )

    # Optional configuration with defaults
    base_path: Path = Field(default=global_config.get_path("hdx", "bronze"))
    user_agent: str = Field(
        default="gigaspatial", description="User agent for HDX API requests"
    )
    hdx_site: str = Field(default="prod", description="HDX site to use (prod or test)")

    # Internal state
    _hdx_configured: bool = Field(default=False, init=False)
    dataset: Optional[Dataset] = Field(default=None, init=False)

    @staticmethod
    def search_datasets(
        query: str,
        rows: int = None,
        sort: str = "relevance asc, metadata_modified desc",
        hdx_site: str = "prod",
        user_agent: str = "gigaspatial",
    ) -> List[Dict]:
        """Search for datasets in HDX before initializing the class.

        Args:
            query: Search query string
            rows: Number of results per page. Defaults to all datasets (sys.maxsize).
            sort: Sort order - one of 'relevance', 'views_recent', 'views_total', 'last_modified' (default: 'relevance')
            hdx_site: HDX site to use - 'prod' or 'test' (default: 'prod')
            user_agent: User agent for HDX API requests (default: 'gigaspatial')

        Returns:
            List of dataset dictionaries containing search results

        Example:
            >>> results = HDXConfig.search_datasets("population", rows=5)
            >>> for dataset in results:
            >>>     print(f"Name: {dataset['name']}, Title: {dataset['title']}")
        """
        try:
            Configuration.create(
                hdx_site=hdx_site,
                user_agent=user_agent,
                hdx_read_only=True,
            )
        except:
            pass

        try:
            results = Dataset.search_in_hdx(query=query, rows=rows, sort=sort)

            return results
        except Exception as e:
            logging.error(f"Error searching HDX datasets: {str(e)}")
            raise

    def __post_init__(self):
        super().__post_init__()
        try:
            Configuration.read()
            self._hdx_configured = True
        except Exception:
            self._hdx_configured = False
        self.configure_hdx()
        self.dataset = self.fetch_dataset()

    @property
    def output_dir_path(self) -> Path:
        """Path to save the downloaded HDX dataset"""
        return self.base_path / self.dataset_name

    def configure_hdx(self):
        """Configure HDX API if not already configured"""
        if not self._hdx_configured:
            try:
                Configuration.create(
                    hdx_site=self.hdx_site,
                    user_agent=self.user_agent,
                    hdx_read_only=True,
                )
                self._hdx_configured = True
            except Exception as e:
                self.logger.error(f"Error configuring HDX API: {str(e)}")
                raise

    def fetch_dataset(self) -> Dataset:
        """Get the HDX dataset"""
        try:
            self.logger.info(f"Fetching HDX dataset: {self.dataset_name}")
            dataset = Dataset.read_from_hdx(self.dataset_name)
            if not dataset:
                raise ValueError(
                    f"Dataset '{self.dataset_name}' not found on HDX. "
                    "Please verify the dataset name or use search_datasets() "
                    "to find available datasets."
                )
            return dataset
        except Exception as e:
            self.logger.error(f"Error fetching HDX dataset: {str(e)}")
            raise

    def _match_pattern(self, value: str, pattern: str) -> bool:
        """Check if a value matches a pattern"""
        if isinstance(pattern, str):
            return pattern.lower() in value.lower()
        return value == pattern

    def _get_patterns_for_value(self, value: Any) -> List[str]:
        """Generate patterns for a given value or list of values"""
        if isinstance(value, list):
            patterns = []
            for v in value:
                patterns.extend(self._get_patterns_for_value(v))
            return patterns

        if not isinstance(value, str):
            return [value]

        patterns = []
        value = value.lower()

        # Add exact match
        patterns.append(value)

        # Add common variations
        patterns.extend(
            [
                f"/{value}_",  # URL path with prefix
                f"/{value}.",  # URL path with extension
                f"_{value}_",  # Filename with value in middle
                f"_{value}.",  # Filename with value at end
            ]
        )

        # If value contains spaces, generate additional patterns
        if " " in value:
            # Generate patterns for space-less version
            no_space = value.replace(" ", "")
            patterns.extend(self._get_patterns_for_value(no_space))

            # Generate patterns for hyphenated version
            hyphenated = value.replace(" ", "-")
            patterns.extend(self._get_patterns_for_value(hyphenated))

        return patterns

    def get_dataset_resources(
        self, filter: Optional[Dict[str, Any]] = None, exact_match: bool = False
    ) -> List[Resource]:
        """Get resources from the HDX dataset

        Args:
            filter: Dictionary of key-value pairs to filter resources
            exact_match: If True, perform exact matching. If False, use pattern matching
        """
        try:
            resources = self.dataset.get_resources()

            # Apply resource filter if specified
            if filter:
                filtered_resources = []
                for res in resources:
                    match = True
                    for key, value in filter.items():
                        if key not in res.data:
                            match = False
                            break

                        if exact_match:
                            # For exact matching, check if value matches or is in list of values
                            if isinstance(value, list):
                                if res.data[key] not in value:
                                    match = False
                                    break
                            elif res.data[key] != value:
                                match = False
                                break
                        else:
                            # For pattern matching, generate patterns for value(s)
                            patterns = self._get_patterns_for_value(value)
                            if not any(
                                self._match_pattern(str(res.data[key]), pattern)
                                for pattern in patterns
                            ):
                                match = False
                                break

                    if match:
                        filtered_resources.append(res)
                resources = filtered_resources

            return resources
        except Exception as e:
            self.logger.error(f"Error getting dataset resources: {str(e)}")
            raise

    def get_relevant_data_units(
        self, source: Union[str, Dict], **kwargs
    ) -> List[Resource]:
        """Get relevant data units based on the source type

        Args:
            source: Either a country name/code (str) or a filter dictionary
            **kwargs: Additional keyword arguments passed to the specific method

        Returns:
            List of matching resources
        """
        if isinstance(source, str):
            # If source is a string, assume it's a country and use country-based filtering
            return self.get_relevant_data_units_by_country(source, **kwargs)
        elif isinstance(source, dict):
            # If source is a dict, use it directly as a filter
            return self.get_dataset_resources(filter=source, **kwargs)
        else:
            raise ValueError(f"Unsupported source type: {type(source)}")

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> List[Resource]:
        raise NotImplementedError(
            "HDX does not support geometry-based filtering. "
            "Please use country-based filtering or direct resource filtering instead."
        )

    def get_relevant_data_units_by_points(
        self, points: List[Union[Point, tuple]], **kwargs
    ) -> List[Resource]:
        raise NotImplementedError(
            "HDX does not support point-based filtering. "
            "Please use country-based filtering or direct resource filtering instead."
        )

    def get_relevant_data_units_by_country(
        self,
        country: str,
        key: str = "url",
        **kwargs,
    ) -> Any:
        """Get relevant data units for a country

        Args:
            country: Country name or code
            key: The key to filter on in the resource data
            patterns: List of patterns to match against the resource data
            **kwargs: Additional keyword arguments
        """
        country = pycountry.countries.lookup(country)
        values = [country.alpha_3, country.alpha_2, country.name]
        return self.get_dataset_resources(
            filter={key: values},
        )

    def get_data_unit_path(self, unit: str, **kwargs) -> str:
        """Get the path for a data unit"""
        try:
            filename = unit.data["name"]
        except:
            filename = unit.get("download_url").split("/")[-1]

        return self.output_dir_path / filename

    def list_resources(self) -> List[str]:
        """List all resources in the dataset directory using the data_store."""
        dataset_folder = str(self.output_dir_path)
        # Check if the dataset directory exists in the data_store
        if not (
            self.data_store.is_dir(dataset_folder)
            or self.data_store.file_exists(dataset_folder)
        ):
            raise FileNotFoundError(
                f"HDX dataset not found at {dataset_folder}. "
                "Download the data first using HDXDownloader."
            )
        return self.data_store.list_files(dataset_folder)

    def __repr__(self) -> str:
        return (
            f"HDXConfig(\n"
            f"  dataset_name='{self.dataset_name}'\n"
            f"  base_path='{self.base_path}'\n"
            f"  hdx_site='{self.hdx_site}'\n"
            f"  user_agent='{self.user_agent}'\n"
            f")"
        )
output_dir_path: Path property

Path to save the downloaded HDX dataset

configure_hdx()

Configure HDX API if not already configured

Source code in gigaspatial/handlers/hdx.py
def configure_hdx(self):
    """Configure HDX API if not already configured"""
    if not self._hdx_configured:
        try:
            Configuration.create(
                hdx_site=self.hdx_site,
                user_agent=self.user_agent,
                hdx_read_only=True,
            )
            self._hdx_configured = True
        except Exception as e:
            self.logger.error(f"Error configuring HDX API: {str(e)}")
            raise
fetch_dataset()

Get the HDX dataset

Source code in gigaspatial/handlers/hdx.py
def fetch_dataset(self) -> Dataset:
    """Get the HDX dataset"""
    try:
        self.logger.info(f"Fetching HDX dataset: {self.dataset_name}")
        dataset = Dataset.read_from_hdx(self.dataset_name)
        if not dataset:
            raise ValueError(
                f"Dataset '{self.dataset_name}' not found on HDX. "
                "Please verify the dataset name or use search_datasets() "
                "to find available datasets."
            )
        return dataset
    except Exception as e:
        self.logger.error(f"Error fetching HDX dataset: {str(e)}")
        raise
get_data_unit_path(unit, **kwargs)

Get the path for a data unit

Source code in gigaspatial/handlers/hdx.py
def get_data_unit_path(self, unit: str, **kwargs) -> str:
    """Get the path for a data unit"""
    try:
        filename = unit.data["name"]
    except:
        filename = unit.get("download_url").split("/")[-1]

    return self.output_dir_path / filename
get_dataset_resources(filter=None, exact_match=False)

Get resources from the HDX dataset

Parameters:

    filter (Optional[Dict[str, Any]]): Dictionary of key-value pairs to filter resources. Default: None
    exact_match (bool): If True, perform exact matching. If False, use pattern matching. Default: False
Source code in gigaspatial/handlers/hdx.py
def get_dataset_resources(
    self, filter: Optional[Dict[str, Any]] = None, exact_match: bool = False
) -> List[Resource]:
    """Get resources from the HDX dataset

    Args:
        filter: Dictionary of key-value pairs to filter resources
        exact_match: If True, perform exact matching. If False, use pattern matching
    """
    try:
        resources = self.dataset.get_resources()

        # Apply resource filter if specified
        if filter:
            filtered_resources = []
            for res in resources:
                match = True
                for key, value in filter.items():
                    if key not in res.data:
                        match = False
                        break

                    if exact_match:
                        # For exact matching, check if value matches or is in list of values
                        if isinstance(value, list):
                            if res.data[key] not in value:
                                match = False
                                break
                        elif res.data[key] != value:
                            match = False
                            break
                    else:
                        # For pattern matching, generate patterns for value(s)
                        patterns = self._get_patterns_for_value(value)
                        if not any(
                            self._match_pattern(str(res.data[key]), pattern)
                            for pattern in patterns
                        ):
                            match = False
                            break

                if match:
                    filtered_resources.append(res)
            resources = filtered_resources

        return resources
    except Exception as e:
        self.logger.error(f"Error getting dataset resources: {str(e)}")
        raise
get_relevant_data_units(source, **kwargs)

Get relevant data units based on the source type

Parameters:

    source (Union[str, Dict]): Either a country name/code (str) or a filter dictionary. required
    **kwargs: Additional keyword arguments passed to the specific method. Default: {}

Returns:

    List[Resource]: List of matching resources

Source code in gigaspatial/handlers/hdx.py
def get_relevant_data_units(
    self, source: Union[str, Dict], **kwargs
) -> List[Resource]:
    """Get relevant data units based on the source type

    Args:
        source: Either a country name/code (str) or a filter dictionary
        **kwargs: Additional keyword arguments passed to the specific method

    Returns:
        List of matching resources
    """
    if isinstance(source, str):
        # If source is a string, assume it's a country and use country-based filtering
        return self.get_relevant_data_units_by_country(source, **kwargs)
    elif isinstance(source, dict):
        # If source is a dict, use it directly as a filter
        return self.get_dataset_resources(filter=source, **kwargs)
    else:
        raise ValueError(f"Unsupported source type: {type(source)}")
get_relevant_data_units_by_country(country, key='url', **kwargs)

Get relevant data units for a country

Parameters:

    country (str): Country name or code. required
    key (str): The key to filter on in the resource data. Default: 'url'
    **kwargs: Additional keyword arguments. Default: {}
Source code in gigaspatial/handlers/hdx.py
def get_relevant_data_units_by_country(
    self,
    country: str,
    key: str = "url",
    **kwargs,
) -> Any:
    """Get relevant data units for a country

    Args:
        country: Country name or code
        key: The key to filter on in the resource data
        patterns: List of patterns to match against the resource data
        **kwargs: Additional keyword arguments
    """
    country = pycountry.countries.lookup(country)
    values = [country.alpha_3, country.alpha_2, country.name]
    return self.get_dataset_resources(
        filter={key: values},
    )
list_resources()

List all resources in the dataset directory using the data_store.

Source code in gigaspatial/handlers/hdx.py
def list_resources(self) -> List[str]:
    """List all resources in the dataset directory using the data_store."""
    dataset_folder = str(self.output_dir_path)
    # Check if the dataset directory exists in the data_store
    if not (
        self.data_store.is_dir(dataset_folder)
        or self.data_store.file_exists(dataset_folder)
    ):
        raise FileNotFoundError(
            f"HDX dataset not found at {dataset_folder}. "
            "Download the data first using HDXDownloader."
        )
    return self.data_store.list_files(dataset_folder)
search_datasets(query, rows=None, sort='relevance asc, metadata_modified desc', hdx_site='prod', user_agent='gigaspatial') staticmethod

Search for datasets in HDX before initializing the class.

Parameters:

    query (str): Search query string. required
    rows (int): Number of results per page. Defaults to all datasets (sys.maxsize). Default: None
    sort (str): Sort order - one of 'relevance', 'views_recent', 'views_total', 'last_modified'. Default: 'relevance asc, metadata_modified desc'
    hdx_site (str): HDX site to use - 'prod' or 'test'. Default: 'prod'
    user_agent (str): User agent for HDX API requests. Default: 'gigaspatial'

Returns:

    List[Dict]: List of dataset dictionaries containing search results

Example:

    results = HDXConfig.search_datasets("population", rows=5)
    for dataset in results:
        print(f"Name: {dataset['name']}, Title: {dataset['title']}")

Source code in gigaspatial/handlers/hdx.py
@staticmethod
def search_datasets(
    query: str,
    rows: int = None,
    sort: str = "relevance asc, metadata_modified desc",
    hdx_site: str = "prod",
    user_agent: str = "gigaspatial",
) -> List[Dict]:
    """Search for datasets in HDX before initializing the class.

    Args:
        query: Search query string
        rows: Number of results per page. Defaults to all datasets (sys.maxsize).
        sort: Sort order - one of 'relevance', 'views_recent', 'views_total', 'last_modified' (default: 'relevance')
        hdx_site: HDX site to use - 'prod' or 'test' (default: 'prod')
        user_agent: User agent for HDX API requests (default: 'gigaspatial')

    Returns:
        List of dataset dictionaries containing search results

    Example:
        >>> results = HDXConfig.search_datasets("population", rows=5)
        >>> for dataset in results:
        >>>     print(f"Name: {dataset['name']}, Title: {dataset['title']}")
    """
    try:
        Configuration.create(
            hdx_site=hdx_site,
            user_agent=user_agent,
            hdx_read_only=True,
        )
    except:
        pass

    try:
        results = Dataset.search_in_hdx(query=query, rows=rows, sort=sort)

        return results
    except Exception as e:
        logging.error(f"Error searching HDX datasets: {str(e)}")
        raise
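To show how these pieces fit together, here is a short sketch that searches HDX first and then configures access to one dataset. The query, dataset choice, and country are illustrative only; instantiating HDXConfig contacts HDX, since fetch_dataset() runs in __post_init__.

    from gigaspatial.handlers.hdx import HDXConfig

    # Search HDX before committing to a dataset name (illustrative query)
    results = HDXConfig.search_datasets("population", rows=5)
    for ds in results:
        print(ds["name"], "-", ds["title"])

    # Configure access to one dataset; this fetches the dataset metadata
    config = HDXConfig(dataset_name=results[0]["name"])

    # Country-based filtering matches the ISO3/ISO2 codes and the country name
    # against each resource's "url" field by default
    resources = config.get_relevant_data_units_by_country("Kenya")
    print([config.get_data_unit_path(r) for r in resources])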

HDXDownloader

Bases: BaseHandlerDownloader

Downloader for HDX datasets

Source code in gigaspatial/handlers/hdx.py
class HDXDownloader(BaseHandlerDownloader):
    """Downloader for HDX datasets"""

    def __init__(
        self,
        config: Union[HDXConfig, dict],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        config = config if isinstance(config, HDXConfig) else HDXConfig(**config)
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(self, resource: str, **kwargs) -> str:
        """Download a single resource"""
        try:
            resource_name = resource.get("name", "Unknown")
            self.logger.info(f"Downloading resource: {resource_name}")

            with tempfile.TemporaryDirectory() as tmpdir:
                url, local_path = resource.download(folder=tmpdir)
                with open(local_path, "rb") as f:
                    data = f.read()
                # Compose the target path in the DataStore
                target_path = str(self.config.get_data_unit_path(resource))
                self.data_store.write_file(target_path, data)
                self.logger.info(
                    f"Downloaded resource: {resource_name} to {target_path}"
                )
                return target_path
        except Exception as e:
            self.logger.error(f"Error downloading resource {resource_name}: {str(e)}")
            return None

    def download_data_units(self, resources: List[Resource], **kwargs) -> List[str]:
        """Download multiple resources sequentially

        Args:
            resources: List of HDX Resource objects
            **kwargs: Additional keyword arguments

        Returns:
            List of paths to downloaded files
        """
        if len(resources) == 0:
            self.logger.warning("There is no resource to download")
            return []

        downloaded_paths = []
        for resource in tqdm(resources, desc="Downloading resources"):
            path = self.download_data_unit(resource)
            if path:
                downloaded_paths.append(path)

        return downloaded_paths

    def download(self, source: Union[Dict, str], **kwargs) -> List[str]:
        """Download data for a source"""
        resources = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(resources)
download(source, **kwargs)

Download data for a source

Source code in gigaspatial/handlers/hdx.py
def download(self, source: Union[Dict, str], **kwargs) -> List[str]:
    """Download data for a source"""
    resources = self.config.get_relevant_data_units(source, **kwargs)
    return self.download_data_units(resources)
download_data_unit(resource, **kwargs)

Download a single resource

Source code in gigaspatial/handlers/hdx.py
def download_data_unit(self, resource: str, **kwargs) -> str:
    """Download a single resource"""
    try:
        resource_name = resource.get("name", "Unknown")
        self.logger.info(f"Downloading resource: {resource_name}")

        with tempfile.TemporaryDirectory() as tmpdir:
            url, local_path = resource.download(folder=tmpdir)
            with open(local_path, "rb") as f:
                data = f.read()
            # Compose the target path in the DataStore
            target_path = str(self.config.get_data_unit_path(resource))
            self.data_store.write_file(target_path, data)
            self.logger.info(
                f"Downloaded resource: {resource_name} to {target_path}"
            )
            return target_path
    except Exception as e:
        self.logger.error(f"Error downloading resource {resource_name}: {str(e)}")
        return None
download_data_units(resources, **kwargs)

Download multiple resources sequentially

Parameters:

    resources (List[Resource]): List of HDX Resource objects. required
    **kwargs: Additional keyword arguments. Default: {}

Returns:

    List[str]: List of paths to downloaded files

Source code in gigaspatial/handlers/hdx.py
def download_data_units(self, resources: List[Resource], **kwargs) -> List[str]:
    """Download multiple resources sequentially

    Args:
        resources: List of HDX Resource objects
        **kwargs: Additional keyword arguments

    Returns:
        List of paths to downloaded files
    """
    if len(resources) == 0:
        self.logger.warning("There is no resource to download")
        return []

    downloaded_paths = []
    for resource in tqdm(resources, desc="Downloading resources"):
        path = self.download_data_unit(resource)
        if path:
            downloaded_paths.append(path)

    return downloaded_paths
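A minimal downloader sketch, assuming an HDX dataset name and country (both placeholders). A dict passed as config is expanded into HDXConfig, and a string source is treated as a country while a dict source is used as a direct resource filter:

    from gigaspatial.handlers.hdx import HDXDownloader

    # "some-hdx-dataset" is a placeholder dataset name
    downloader = HDXDownloader(config={"dataset_name": "some-hdx-dataset"})

    # String source -> country-based filtering; dict source -> resource filter
    paths = downloader.download("Kenya")
    print(paths)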

HDXHandler

Bases: BaseHandler

Handler for HDX datasets

Source code in gigaspatial/handlers/hdx.py
class HDXHandler(BaseHandler):
    """Handler for HDX datasets"""

    def __init__(
        self,
        dataset_name: str,
        config: Optional[HDXConfig] = None,
        downloader: Optional[HDXDownloader] = None,
        reader: Optional[HDXReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        self._dataset_name = dataset_name
        super().__init__(
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> HDXConfig:
        """Create and return a HDXConfig instance"""
        return HDXConfig(
            dataset_name=self._dataset_name,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_downloader(
        self,
        config: HDXConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> HDXDownloader:
        """Create and return a HDXDownloader instance"""
        return HDXDownloader(
            config=config,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_reader(
        self,
        config: HDXConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> HDXReader:
        """Create and return a HDXReader instance"""
        return HDXReader(
            config=config,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )
create_config(data_store, logger, **kwargs)

Create and return a HDXConfig instance

Source code in gigaspatial/handlers/hdx.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> HDXConfig:
    """Create and return a HDXConfig instance"""
    return HDXConfig(
        dataset_name=self._dataset_name,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
create_downloader(config, data_store, logger, **kwargs)

Create and return a HDXDownloader instance

Source code in gigaspatial/handlers/hdx.py
def create_downloader(
    self,
    config: HDXConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> HDXDownloader:
    """Create and return a HDXDownloader instance"""
    return HDXDownloader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a HDXReader instance

Source code in gigaspatial/handlers/hdx.py
def create_reader(
    self,
    config: HDXConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> HDXReader:
    """Create and return a HDXReader instance"""
    return HDXReader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
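For end-to-end use, the handler builds the config, downloader, and reader in one step. A sketch under the assumption that the dataset name and country below exist on HDX (placeholders only):

    from gigaspatial.handlers.hdx import HDXHandler

    handler = HDXHandler(dataset_name="some-hdx-dataset")

    # The components are reachable via the BaseHandler properties
    resources = handler.config.get_relevant_data_units("Kenya")
    downloaded = handler.downloader.download_data_units(resources)
    print(downloaded)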

HDXReader

Bases: BaseHandlerReader

Reader for HDX datasets

Source code in gigaspatial/handlers/hdx.py
class HDXReader(BaseHandlerReader):
    """Reader for HDX datasets"""

    def __init__(
        self,
        config: Optional[HDXConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        config = config if isinstance(config, HDXConfig) else HDXConfig(**config)
        super().__init__(config=config, data_store=data_store, logger=logger)

    def resolve_source_paths(
        self,
        source: Union[
            str,  # country code
            Dict,  # filter
            Path,  # path
            str,  # path
            List[Union[str, Path]],
        ],
        **kwargs,
    ) -> List[Union[str, Path]]:
        if isinstance(source, (str, Path)):
            # Could be a country code or a path
            if self.data_store.file_exists(str(source)) or str(source).endswith(
                (".csv", ".tif", ".json", ".parquet", ".gz", ".geojson", ".zip")
            ):
                source_data_paths = self.resolve_by_paths(source)
            else:
                source_data_paths = self.resolve_by_country(source, **kwargs)
        elif isinstance(source, Dict):
            resources = self.config.get_relevant_data_units(source=source, **kwargs)
            source_data_paths = self.config.get_data_unit_paths(resources, **kwargs)
        elif isinstance(source, Iterable) and all(
            isinstance(p, (str, Path)) for p in source
        ):
            source_data_paths = self.resolve_by_paths(source)
        else:
            raise NotImplementedError(f"Unsupported source type: {type(source)}")

        self.logger.info(f"Resolved {len(source_data_paths)} paths!")
        return source_data_paths

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> Any:
        """Load data from paths"""
        if len(source_data_path) == 1:
            return read_dataset(self.data_store, source_data_path[0])

        all_data = {}
        for file_path in source_data_path:
            try:
                all_data[file_path] = read_dataset(self.data_store, file_path)
            except Exception as e:
                raise ValueError(f"Could not read file {file_path}: {str(e)}")
        return all_data

    def load_all_resources(self):
        resources = self.config.list_resources()
        return self.load_from_paths(resources)
load_from_paths(source_data_path, **kwargs)

Load data from paths

Source code in gigaspatial/handlers/hdx.py
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> Any:
    """Load data from paths"""
    if len(source_data_path) == 1:
        return read_dataset(self.data_store, source_data_path[0])

    all_data = {}
    for file_path in source_data_path:
        try:
            all_data[file_path] = read_dataset(self.data_store, file_path)
        except Exception as e:
            raise ValueError(f"Could not read file {file_path}: {str(e)}")
    return all_data
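A short reader sketch, assuming the dataset has already been downloaded to the data store (dataset name and country are placeholders):

    from gigaspatial.handlers.hdx import HDXConfig, HDXReader

    reader = HDXReader(config=HDXConfig(dataset_name="some-hdx-dataset"))

    # Read every resource already present in the dataset directory...
    all_data = reader.load_all_resources()

    # ...or resolve paths by country, filter dict, or explicit file paths
    data = reader.load("Kenya")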

mapbox_image

MapboxImageDownloader

Class to download images from Mapbox Static Images API using a specific style

Source code in gigaspatial/handlers/mapbox_image.py
class MapboxImageDownloader:
    """Class to download images from Mapbox Static Images API using a specific style"""

    BASE_URL = "https://api.mapbox.com/styles/v1"

    def __init__(
        self,
        access_token: str = config.MAPBOX_ACCESS_TOKEN,
        style_id: Optional[str] = None,
        data_store: Optional[DataStore] = None,
    ):
        """
        Initialize the downloader with Mapbox credentials

        Args:
            access_token: Mapbox access token
            style_id: Mapbox style ID to use for image download
            data_store: Instance of DataStore for accessing data storage
        """
        self.access_token = access_token
        self.style_id = style_id if style_id else "mapbox/satellite-v9"
        self.data_store = data_store or LocalDataStore()
        self.logger = config.get_logger(self.__class__.__name__)

    def _construct_url(self, bounds: Iterable[float], image_size: str) -> str:
        """Construct the Mapbox Static Images API URL"""
        bounds_str = f"[{','.join(map(str, bounds))}]"

        return (
            f"{self.BASE_URL}/{self.style_id}/static/{bounds_str}/{image_size}"
            f"?access_token={self.access_token}&attribution=false&logo=false"
        )

    def _download_single_image(self, url: str, output_path: Path) -> bool:
        """Download a single image from URL"""
        try:
            response = requests.get(url)
            response.raise_for_status()

            with self.data_store.open(str(output_path), "wb") as f:
                f.write(response.content)
            return True
        except Exception as e:
            self.logger.warning(f"Error downloading {output_path.name}: {str(e)}")
            return False

    def download_images_by_tiles(
        self,
        mercator_tiles: "MercatorTiles",
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        max_workers: int = 4,
        image_prefix: str = "image_",
    ) -> None:
        """
        Download images for given mercator tiles using the specified style

        Args:
            mercator_tiles: MercatorTiles instance containing quadkeys
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            max_workers: Maximum number of concurrent downloads
            image_prefix: Prefix for output image names
        """
        output_dir = Path(output_dir)
        # self.data_store.makedirs(str(output_dir), exist_ok=True)

        image_size_str = f"{image_size[0]}x{image_size[1]}"
        total_tiles = len(mercator_tiles.quadkeys)

        self.logger.info(
            f"Downloading {total_tiles} tiles with size {image_size_str}..."
        )

        def _get_tile_bounds(quadkey: str) -> List[float]:
            """Get tile bounds from quadkey"""
            tile = mercantile.quadkey_to_tile(quadkey)
            bounds = mercantile.bounds(tile)
            return [bounds.west, bounds.south, bounds.east, bounds.north]

        def download_image(quadkey: str) -> bool:
            bounds = _get_tile_bounds(quadkey)
            file_name = f"{image_prefix}{quadkey}.png"

            url = self._construct_url(bounds, image_size_str)
            success = self._download_single_image(url, output_dir / file_name)

            return success

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(download_image, quadkey)
                for quadkey in mercator_tiles.quadkeys
            ]

            successful_downloads = 0
            with tqdm(total=total_tiles) as pbar:
                for future in as_completed(futures):
                    if future.result():
                        successful_downloads += 1
                    pbar.update(1)

        self.logger.info(
            f"Successfully downloaded {successful_downloads}/{total_tiles} images!"
        )

    def download_images_by_bounds(
        self,
        gdf: gpd.GeoDataFrame,
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        max_workers: int = 4,
        image_prefix: str = "image_",
    ) -> None:
        """
        Download images for given points using the specified style

        Args:
            gdf: GeoDataFrame containing bounding box polygons
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            max_workers: Maximum number of concurrent downloads
            image_prefix: Prefix for output image names
        """
        output_dir = Path(output_dir)
        # self.data_store.makedirs(str(output_dir), exist_ok=True)

        image_size_str = f"{image_size[0]}x{image_size[1]}"
        total_images = len(gdf)

        self.logger.info(
            f"Downloading {total_images} images with size {image_size_str}..."
        )

        def download_image(idx: Any, bounds: Tuple[float, float, float, float]) -> bool:
            file_name = f"{image_prefix}{idx}.png"
            url = self._construct_url(bounds, image_size_str)
            success = self._download_single_image(url, output_dir / file_name)
            return success

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(download_image, row.Index, row.geometry.bounds)
                for row in gdf.itertuples()
            ]

            successful_downloads = 0
            with tqdm(total=total_images) as pbar:
                for future in as_completed(futures):
                    if future.result():
                        successful_downloads += 1
                    pbar.update(1)

        self.logger.info(
            f"Successfully downloaded {successful_downloads}/{total_images} images!"
        )

    def download_images_by_coordinates(
        self,
        data: Union[pd.DataFrame, List[Tuple[float, float]]],
        res_meters_pixel: float,
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        max_workers: int = 4,
        image_prefix: str = "image_",
    ) -> None:
        """
        Download images for given coordinates by creating bounded boxes around points

        Args:
            data: Either a DataFrame with either latitude/longitude columns or a geometry column or a list of (lat, lon) tuples
            res_meters_pixel: Size of the bounding box in meters (creates a square)
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            max_workers: Maximum number of concurrent downloads
            image_prefix: Prefix for output image names
        """

        if isinstance(data, pd.DataFrame):
            coordinates_df = data
        else:
            coordinates_df = pd.DataFrame(data, columns=["latitude", "longitude"])

        gdf = convert_to_geodataframe(coordinates_df)

        buffered_gdf = buffer_geodataframe(
            gdf, res_meters_pixel / 2, cap_style="square"
        )

        self.download_images_by_bounds(
            buffered_gdf, output_dir, image_size, max_workers, image_prefix
        )
__init__(access_token=config.MAPBOX_ACCESS_TOKEN, style_id=None, data_store=None)

Initialize the downloader with Mapbox credentials

Parameters:

    access_token (str): Mapbox access token. Default: config.MAPBOX_ACCESS_TOKEN
    style_id (Optional[str]): Mapbox style ID to use for image download. Default: None
    data_store (Optional[DataStore]): Instance of DataStore for accessing data storage. Default: None
Source code in gigaspatial/handlers/mapbox_image.py
def __init__(
    self,
    access_token: str = config.MAPBOX_ACCESS_TOKEN,
    style_id: Optional[str] = None,
    data_store: Optional[DataStore] = None,
):
    """
    Initialize the downloader with Mapbox credentials

    Args:
        access_token: Mapbox access token
        style_id: Mapbox style ID to use for image download
        data_store: Instance of DataStore for accessing data storage
    """
    self.access_token = access_token
    self.style_id = style_id if style_id else "mapbox/satellite-v9"
    self.data_store = data_store or LocalDataStore()
    self.logger = config.get_logger(self.__class__.__name__)
download_images_by_bounds(gdf, output_dir, image_size=(512, 512), max_workers=4, image_prefix='image_')

Download images for given points using the specified style

Parameters:

    gdf (GeoDataFrame): GeoDataFrame containing bounding box polygons. required
    output_dir (Union[str, Path]): Directory to save images. required
    image_size (Tuple[int, int]): Tuple of (width, height) for output images. Default: (512, 512)
    max_workers (int): Maximum number of concurrent downloads. Default: 4
    image_prefix (str): Prefix for output image names. Default: 'image_'
Source code in gigaspatial/handlers/mapbox_image.py
def download_images_by_bounds(
    self,
    gdf: gpd.GeoDataFrame,
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    max_workers: int = 4,
    image_prefix: str = "image_",
) -> None:
    """
    Download images for given points using the specified style

    Args:
        gdf: GeoDataFrame containing bounding box polygons
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        max_workers: Maximum number of concurrent downloads
        image_prefix: Prefix for output image names
    """
    output_dir = Path(output_dir)
    # self.data_store.makedirs(str(output_dir), exist_ok=True)

    image_size_str = f"{image_size[0]}x{image_size[1]}"
    total_images = len(gdf)

    self.logger.info(
        f"Downloading {total_images} images with size {image_size_str}..."
    )

    def download_image(idx: Any, bounds: Tuple[float, float, float, float]) -> bool:
        file_name = f"{image_prefix}{idx}.png"
        url = self._construct_url(bounds, image_size_str)
        success = self._download_single_image(url, output_dir / file_name)
        return success

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(download_image, row.Index, row.geometry.bounds)
            for row in gdf.itertuples()
        ]

        successful_downloads = 0
        with tqdm(total=total_images) as pbar:
            for future in as_completed(futures):
                if future.result():
                    successful_downloads += 1
                pbar.update(1)

    self.logger.info(
        f"Successfully downloaded {successful_downloads}/{total_images} images!"
    )
download_images_by_coordinates(data, res_meters_pixel, output_dir, image_size=(512, 512), max_workers=4, image_prefix='image_')

Download images for given coordinates by creating bounded boxes around points

Parameters:

    data (Union[DataFrame, List[Tuple[float, float]]]): Either a DataFrame with latitude/longitude columns (or a geometry column), or a list of (lat, lon) tuples. required
    res_meters_pixel (float): Size of the bounding box in meters (creates a square). required
    output_dir (Union[str, Path]): Directory to save images. required
    image_size (Tuple[int, int]): Tuple of (width, height) for output images. Default: (512, 512)
    max_workers (int): Maximum number of concurrent downloads. Default: 4
    image_prefix (str): Prefix for output image names. Default: 'image_'
Source code in gigaspatial/handlers/mapbox_image.py
def download_images_by_coordinates(
    self,
    data: Union[pd.DataFrame, List[Tuple[float, float]]],
    res_meters_pixel: float,
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    max_workers: int = 4,
    image_prefix: str = "image_",
) -> None:
    """
    Download images for given coordinates by creating bounded boxes around points

    Args:
        data: Either a DataFrame with either latitude/longitude columns or a geometry column or a list of (lat, lon) tuples
        res_meters_pixel: Size of the bounding box in meters (creates a square)
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        max_workers: Maximum number of concurrent downloads
        image_prefix: Prefix for output image names
    """

    if isinstance(data, pd.DataFrame):
        coordinates_df = data
    else:
        coordinates_df = pd.DataFrame(data, columns=["latitude", "longitude"])

    gdf = convert_to_geodataframe(coordinates_df)

    buffered_gdf = buffer_geodataframe(
        gdf, res_meters_pixel / 2, cap_style="square"
    )

    self.download_images_by_bounds(
        buffered_gdf, output_dir, image_size, max_workers, image_prefix
    )
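A usage sketch for coordinate-based downloads. The access token and coordinates are placeholders; by default the token is read from the package configuration and the style falls back to "mapbox/satellite-v9":

    from gigaspatial.handlers.mapbox_image import MapboxImageDownloader

    downloader = MapboxImageDownloader(access_token="YOUR_MAPBOX_TOKEN")

    # 300 m square chips centered on each (lat, lon) pair, saved as PNGs
    downloader.download_images_by_coordinates(
        data=[(41.0082, 28.9784), (39.9208, 32.8541)],
        res_meters_pixel=300,
        output_dir="mapbox_chips",
        image_size=(512, 512),
        max_workers=4,
    )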
download_images_by_tiles(mercator_tiles, output_dir, image_size=(512, 512), max_workers=4, image_prefix='image_')

Download images for given mercator tiles using the specified style

Parameters:

    mercator_tiles (MercatorTiles): MercatorTiles instance containing quadkeys. required
    output_dir (Union[str, Path]): Directory to save images. required
    image_size (Tuple[int, int]): Tuple of (width, height) for output images. Default: (512, 512)
    max_workers (int): Maximum number of concurrent downloads. Default: 4
    image_prefix (str): Prefix for output image names. Default: 'image_'
Source code in gigaspatial/handlers/mapbox_image.py
def download_images_by_tiles(
    self,
    mercator_tiles: "MercatorTiles",
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    max_workers: int = 4,
    image_prefix: str = "image_",
) -> None:
    """
    Download images for given mercator tiles using the specified style

    Args:
        mercator_tiles: MercatorTiles instance containing quadkeys
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        max_workers: Maximum number of concurrent downloads
        image_prefix: Prefix for output image names
    """
    output_dir = Path(output_dir)
    # self.data_store.makedirs(str(output_dir), exist_ok=True)

    image_size_str = f"{image_size[0]}x{image_size[1]}"
    total_tiles = len(mercator_tiles.quadkeys)

    self.logger.info(
        f"Downloading {total_tiles} tiles with size {image_size_str}..."
    )

    def _get_tile_bounds(quadkey: str) -> List[float]:
        """Get tile bounds from quadkey"""
        tile = mercantile.quadkey_to_tile(quadkey)
        bounds = mercantile.bounds(tile)
        return [bounds.west, bounds.south, bounds.east, bounds.north]

    def download_image(quadkey: str) -> bool:
        bounds = _get_tile_bounds(quadkey)
        file_name = f"{image_prefix}{quadkey}.png"

        url = self._construct_url(bounds, image_size_str)
        success = self._download_single_image(url, output_dir / file_name)

        return success

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(download_image, quadkey)
            for quadkey in mercator_tiles.quadkeys
        ]

        successful_downloads = 0
        with tqdm(total=total_tiles) as pbar:
            for future in as_completed(futures):
                if future.result():
                    successful_downloads += 1
                pbar.update(1)

    self.logger.info(
        f"Successfully downloaded {successful_downloads}/{total_tiles} images!"
    )
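As the source above shows, only the .quadkeys attribute of the tiles object is used, so for a quick smoke test any object exposing a list of quadkeys will do; in normal use you would pass a MercatorTiles instance. A hedged sketch (token and quadkeys are placeholders):

    from types import SimpleNamespace
    from gigaspatial.handlers.mapbox_image import MapboxImageDownloader

    # Stand-in for a MercatorTiles instance; only .quadkeys is accessed here
    tiles = SimpleNamespace(quadkeys=["1202102332", "1202102333"])

    downloader = MapboxImageDownloader(access_token="YOUR_MAPBOX_TOKEN")
    downloader.download_images_by_tiles(tiles, output_dir="tiles", image_size=(512, 512))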

maxar_image

MaxarConfig

Bases: BaseModel

Configuration for Maxar Image Downloader using Pydantic

Source code in gigaspatial/handlers/maxar_image.py
class MaxarConfig(BaseModel):
    """Configuration for Maxar Image Downloader using Pydantic"""

    username: str = Field(
        default=global_config.MAXAR_USERNAME, description="Maxar API username"
    )
    password: str = Field(
        default=global_config.MAXAR_PASSWORD, description="Maxar API password"
    )
    connection_string: str = Field(
        default=global_config.MAXAR_CONNECTION_STRING,
        description="Maxar WMS connection string",
    )

    base_url: HttpUrl = Field(
        default="https://evwhs.digitalglobe.com/mapservice/wmsaccess?",
        description="Base URL for Maxar WMS service",
    )

    layers: List[Literal["DigitalGlobe:ImageryFootprint", "DigitalGlobe:Imagery"]] = (
        Field(
            default=["DigitalGlobe:Imagery"],
            description="List of layers to request from WMS",
        )
    )

    feature_profile: str = Field(
        default="Most_Aesthetic_Mosaic_Profile",
        description="Feature profile to use for WMS requests",
    )

    coverage_cql_filter: str = Field(
        default="", description="CQL filter for coverage selection"
    )

    exceptions: str = Field(
        default="application/vnd.ogc.se_xml",
        description="Exception handling format for WMS",
    )

    transparent: bool = Field(
        default=True,
        description="Whether the requested images should have transparency",
    )

    image_format: Literal["image/png", "image/jpeg", "image/geotiff"] = Field(
        default="image/png",
    )

    data_crs: Literal["EPSG:4326", "EPSG:3395", "EPSG:3857", "CAR:42004"] = Field(
        default="EPSG:4326"
    )

    max_retries: int = Field(
        default=3, description="Number of retries for failed image downloads"
    )

    retry_delay: int = Field(default=5, description="Delay in seconds between retries")

    @field_validator("username", "password", "connection_string")
    @classmethod
    def validate_non_empty(cls, value: str, field) -> str:
        """Ensure required credentials are provided"""
        if not value or value.strip() == "":
            raise ValueError(
                f"{field.name} cannot be empty. Please provide a valid {field.name}."
            )
        return value

    @property
    def wms_url(self) -> str:
        """Generate the full WMS URL with connection string"""
        return f"{self.base_url}connectid={self.connection_string}"

    @property
    def suffix(self) -> str:
        return f".{self.image_format.split('/')[1]}"
wms_url: str property

Generate the full WMS URL with connection string

validate_non_empty(value, field) classmethod

Ensure required credentials are provided

Source code in gigaspatial/handlers/maxar_image.py
@field_validator("username", "password", "connection_string")
@classmethod
def validate_non_empty(cls, value: str, field) -> str:
    """Ensure required credentials are provided"""
    if not value or value.strip() == "":
        raise ValueError(
            f"{field.name} cannot be empty. Please provide a valid {field.name}."
        )
    return value
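A configuration sketch with placeholder credentials; by default these values are read from the package configuration (MAXAR_USERNAME, MAXAR_PASSWORD, MAXAR_CONNECTION_STRING), and the validator above rejects empty strings:

    from gigaspatial.handlers.maxar_image import MaxarConfig

    config = MaxarConfig(
        username="user@example.com",       # placeholder
        password="********",               # placeholder
        connection_string="YOUR_CONNECT_ID",
        image_format="image/png",
    )
    print(config.wms_url)   # base_url + "connectid=<connection string>"
    print(config.suffix)    # ".png"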

MaxarImageDownloader

Class to download images from Maxar

Source code in gigaspatial/handlers/maxar_image.py
class MaxarImageDownloader:
    """Class to download images from Maxar"""

    def __init__(
        self,
        config: Optional[MaxarConfig] = None,
        data_store: Optional[DataStore] = None,
    ):
        """
        Initialize the downloader with Maxar config.

        Args:
            config: MaxarConfig instance containing credentials and settings
            data_store: Instance of DataStore for accessing data storage
        """
        self.config = config or MaxarConfig()
        self.wms = WebMapService(
            self.config.wms_url,
            username=self.config.username,
            password=self.config.password,
        )
        self.data_store = data_store or LocalDataStore()
        self.logger = global_config.get_logger(self.__class__.__name__)

    def _download_single_image(self, bbox, output_path: Union[Path, str], size) -> bool:
        """Download a single image from bbox and pixel size"""
        for attempt in range(self.config.max_retries):
            try:
                img_data = self.wms.getmap(
                    bbox=bbox,
                    layers=self.config.layers,
                    srs=self.config.data_crs,
                    size=size,
                    featureProfile=self.config.feature_profile,
                    coverage_cql_filter=self.config.coverage_cql_filter,
                    exceptions=self.config.exceptions,
                    transparent=self.config.transparent,
                    format=self.config.image_format,
                )
                self.data_store.write_file(str(output_path), img_data.read())
                return True
            except Exception as e:
                self.logger.warning(
                    f"Attempt {attempt + 1} of downloading {output_path.name} failed: {str(e)}"
                )
                if attempt < self.config.max_retries - 1:
                    sleep(self.config.retry_delay)
                else:
                    self.logger.warning(
                        f"Failed to download {output_path.name} after {self.config.max_retries} attemps: {str(e)}"
                    )
                    return False

    def download_images_by_tiles(
        self,
        mercator_tiles: "MercatorTiles",
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        image_prefix: str = "maxar_image_",
    ) -> None:
        """
        Download images for the given Mercator tiles

        Args:
            mercator_tiles: MercatorTiles instance containing quadkeys
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            image_prefix: Prefix for output image names
        """
        output_dir = Path(output_dir)

        image_size_str = f"{image_size[0]}x{image_size[1]}"
        total_tiles = len(mercator_tiles.quadkeys)

        self.logger.info(
            f"Downloading {total_tiles} tiles with size {image_size_str}..."
        )

        def _get_tile_bounds(quadkey: str) -> Tuple[float, float, float, float]:
            """Get tile bounds from quadkey"""
            tile = mercantile.quadkey_to_tile(quadkey)
            bounds = mercantile.bounds(tile)
            return (bounds.west, bounds.south, bounds.east, bounds.north)

        def download_image(
            quadkey: str, image_size: Tuple[int, int], suffix: str = self.config.suffix
        ) -> bool:
            bounds = _get_tile_bounds(quadkey)
            file_name = f"{image_prefix}{quadkey}{suffix}"

            success = self._download_single_image(
                bounds, output_dir / file_name, image_size
            )

            return success

        successful_downloads = 0
        with tqdm(total=total_tiles) as pbar:
            for quadkey in mercator_tiles.quadkeys:
                if download_image(quadkey, image_size):
                    successful_downloads += 1
                pbar.update(1)

        self.logger.info(
            f"Successfully downloaded {successful_downloads}/{total_tiles} images!"
        )

    def download_images_by_bounds(
        self,
        gdf: gpd.GeoDataFrame,
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        image_prefix: str = "maxar_image_",
    ) -> None:
        """
        Download images for the given bounding box polygons

        Args:
            gdf: GeoDataFrame containing bounding box polygons
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            image_prefix: Prefix for output image names
        """
        output_dir = Path(output_dir)

        image_size_str = f"{image_size[0]}x{image_size[1]}"
        total_images = len(gdf)

        self.logger.info(
            f"Downloading {total_images} images with size {image_size_str}..."
        )

        def download_image(
            idx: Any,
            bounds: Tuple[float, float, float, float],
            image_size,
            suffix: str = self.config.suffix,
        ) -> bool:
            file_name = f"{image_prefix}{idx}{suffix}"
            success = self._download_single_image(
                bounds, output_dir / file_name, image_size
            )
            return success

        gdf = gdf.to_crs(self.config.data_crs)

        successful_downloads = 0
        with tqdm(total=total_images) as pbar:
            for row in gdf.itertuples():
                if download_image(row.Index, tuple(row.geometry.bounds), image_size):
                    successful_downloads += 1
                pbar.update(1)

        self.logger.info(
            f"Successfully downloaded {successful_downloads}/{total_images} images!"
        )

    def download_images_by_coordinates(
        self,
        data: Union[pd.DataFrame, List[Tuple[float, float]]],
        res_meters_pixel: float,
        output_dir: Union[str, Path],
        image_size: Tuple[int, int] = (512, 512),
        image_prefix: str = "maxar_image_",
    ) -> None:
        """
        Download images for the given coordinates by creating bounding boxes around the points

        Args:
            data: A DataFrame with latitude/longitude columns or a geometry column, or a list of (lat, lon) tuples
            res_meters_pixel: resolution in meters per pixel
            output_dir: Directory to save images
            image_size: Tuple of (width, height) for output images
            image_prefix: Prefix for output image names
        """

        if isinstance(data, pd.DataFrame):
            coordinates_df = data
        else:
            coordinates_df = pd.DataFrame(data, columns=["latitude", "longitude"])

        gdf = convert_to_geodataframe(coordinates_df)

        buffered_gdf = buffer_geodataframe(
            gdf, res_meters_pixel / 2, cap_style="square"
        )

        buffered_gdf = buffered_gdf.to_crs(self.config.data_crs)

        self.download_images_by_bounds(
            buffered_gdf, output_dir, image_size, image_prefix
        )
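
A hedged usage sketch of the downloader; the coordinates and box size below are illustrative only:

from gigaspatial.handlers.maxar_image import MaxarConfig, MaxarImageDownloader

config = MaxarConfig(
    username="my_user", password="my_password", connection_string="my-connect-id"
)
downloader = MaxarImageDownloader(config=config)

# Each (lat, lon) point is buffered by res_meters_pixel / 2 on every side into a
# square bounding box, then fetched as a 512x512 image via the WMS endpoint.
downloader.download_images_by_coordinates(
    data=[(0.3476, 32.5825), (-1.2921, 36.8219)],  # (lat, lon) tuples
    res_meters_pixel=300,
    output_dir="maxar_images",
    image_size=(512, 512),
)
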
__init__(config=None, data_store=None)

Initialize the downloader with Maxar config.

Parameters:

Name Type Description Default
config Optional[MaxarConfig]

MaxarConfig instance containing credentials and settings

None
data_store Optional[DataStore]

Instance of DataStore for accessing data storage

None
Source code in gigaspatial/handlers/maxar_image.py
def __init__(
    self,
    config: Optional[MaxarConfig] = None,
    data_store: Optional[DataStore] = None,
):
    """
    Initialize the downloader with Maxar config.

    Args:
        config: MaxarConfig instance containing credentials and settings
        data_store: Instance of DataStore for accessing data storage
    """
    self.config = config or MaxarConfig()
    self.wms = WebMapService(
        self.config.wms_url,
        username=self.config.username,
        password=self.config.password,
    )
    self.data_store = data_store or LocalDataStore()
    self.logger = global_config.get_logger(self.__class__.__name__)
download_images_by_bounds(gdf, output_dir, image_size=(512, 512), image_prefix='maxar_image_')

Download images for the given bounding box polygons

Parameters:

Name Type Description Default
gdf GeoDataFrame

GeoDataFrame containing bounding box polygons

required
output_dir Union[str, Path]

Directory to save images

required
image_size Tuple[int, int]

Tuple of (width, height) for output images

(512, 512)
image_prefix str

Prefix for output image names

'maxar_image_'
Source code in gigaspatial/handlers/maxar_image.py
def download_images_by_bounds(
    self,
    gdf: gpd.GeoDataFrame,
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    image_prefix: str = "maxar_image_",
) -> None:
    """
    Download images for the given bounding box polygons

    Args:
        gdf: GeoDataFrame containing bounding box polygons
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        image_prefix: Prefix for output image names
    """
    output_dir = Path(output_dir)

    image_size_str = f"{image_size[0]}x{image_size[1]}"
    total_images = len(gdf)

    self.logger.info(
        f"Downloading {total_images} images with size {image_size_str}..."
    )

    def download_image(
        idx: Any,
        bounds: Tuple[float, float, float, float],
        image_size,
        suffix: str = self.config.suffix,
    ) -> bool:
        file_name = f"{image_prefix}{idx}{suffix}"
        success = self._download_single_image(
            bounds, output_dir / file_name, image_size
        )
        return success

    gdf = gdf.to_crs(self.config.data_crs)

    successful_downloads = 0
    with tqdm(total=total_images) as pbar:
        for row in gdf.itertuples():
            if download_image(row.Index, tuple(row.geometry.bounds), image_size):
                successful_downloads += 1
            pbar.update(1)

    self.logger.info(
        f"Successfully downloaded {successful_downloads}/{total_images} images!"
    )
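
The same downloader can be driven with explicit bounding boxes; a short sketch (box extents are illustrative, coordinates in EPSG:4326):

import geopandas as gpd
from shapely.geometry import box
from gigaspatial.handlers.maxar_image import MaxarConfig, MaxarImageDownloader

config = MaxarConfig(
    username="my_user", password="my_password", connection_string="my-connect-id"
)
downloader = MaxarImageDownloader(config=config)

boxes = gpd.GeoDataFrame(
    geometry=[box(32.55, 0.30, 32.60, 0.35), box(36.80, -1.30, 36.85, -1.25)],
    crs="EPSG:4326",
)

# Index values are embedded in the output file names (image_prefix + index + suffix).
downloader.download_images_by_bounds(
    gdf=boxes,
    output_dir="maxar_images",
    image_size=(512, 512),
)
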
download_images_by_coordinates(data, res_meters_pixel, output_dir, image_size=(512, 512), image_prefix='maxar_image_')

Download images for the given coordinates by creating bounding boxes around the points

Parameters:

Name Type Description Default
data Union[DataFrame, List[Tuple[float, float]]]

A DataFrame with latitude/longitude columns or a geometry column, or a list of (lat, lon) tuples

required
res_meters_pixel float

resolution in meters per pixel

required
output_dir Union[str, Path]

Directory to save images

required
image_size Tuple[int, int]

Tuple of (width, height) for output images

(512, 512)
image_prefix str

Prefix for output image names

'maxar_image_'
Source code in gigaspatial/handlers/maxar_image.py
def download_images_by_coordinates(
    self,
    data: Union[pd.DataFrame, List[Tuple[float, float]]],
    res_meters_pixel: float,
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    image_prefix: str = "maxar_image_",
) -> None:
    """
    Download images for the given coordinates by creating bounding boxes around the points

    Args:
        data: A DataFrame with latitude/longitude columns or a geometry column, or a list of (lat, lon) tuples
        res_meters_pixel: resolution in meters per pixel
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        image_prefix: Prefix for output image names
    """

    if isinstance(data, pd.DataFrame):
        coordinates_df = data
    else:
        coordinates_df = pd.DataFrame(data, columns=["latitude", "longitude"])

    gdf = convert_to_geodataframe(coordinates_df)

    buffered_gdf = buffer_geodataframe(
        gdf, res_meters_pixel / 2, cap_style="square"
    )

    buffered_gdf = buffered_gdf.to_crs(self.config.data_crs)

    self.download_images_by_bounds(
        buffered_gdf, output_dir, image_size, image_prefix
    )
download_images_by_tiles(mercator_tiles, output_dir, image_size=(512, 512), image_prefix='maxar_image_')

Download images for the given Mercator tiles

Parameters:

Name Type Description Default
mercator_tiles MercatorTiles

MercatorTiles instance containing quadkeys

required
output_dir Union[str, Path]

Directory to save images

required
image_size Tuple[int, int]

Tuple of (width, height) for output images

(512, 512)
image_prefix str

Prefix for output image names

'maxar_image_'
Source code in gigaspatial/handlers/maxar_image.py
def download_images_by_tiles(
    self,
    mercator_tiles: "MercatorTiles",
    output_dir: Union[str, Path],
    image_size: Tuple[int, int] = (512, 512),
    image_prefix: str = "maxar_image_",
) -> None:
    """
    Download images for the given Mercator tiles

    Args:
        mercator_tiles: MercatorTiles instance containing quadkeys
        output_dir: Directory to save images
        image_size: Tuple of (width, height) for output images
        image_prefix: Prefix for output image names
    """
    output_dir = Path(output_dir)

    image_size_str = f"{image_size[0]}x{image_size[1]}"
    total_tiles = len(mercator_tiles.quadkeys)

    self.logger.info(
        f"Downloading {total_tiles} tiles with size {image_size_str}..."
    )

    def _get_tile_bounds(quadkey: str) -> Tuple[float, float, float, float]:
        """Get tile bounds from quadkey"""
        tile = mercantile.quadkey_to_tile(quadkey)
        bounds = mercantile.bounds(tile)
        return (bounds.west, bounds.south, bounds.east, bounds.north)

    def download_image(
        quadkey: str, image_size: Tuple[int, int], suffix: str = self.config.suffix
    ) -> bool:
        bounds = _get_tile_bounds(quadkey)
        file_name = f"{image_prefix}{quadkey}{suffix}"

        success = self._download_single_image(
            bounds, output_dir / file_name, image_size
        )

        return success

    successful_downloads = 0
    with tqdm(total=total_tiles) as pbar:
        for quadkey in mercator_tiles.quadkeys:
            if download_image(quadkey, image_size):
                successful_downloads += 1
            pbar.update(1)

    self.logger.info(
        f"Successfully downloaded {successful_downloads}/{total_tiles} images!"
    )
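
Quadkey-based downloads follow the same pattern; the import path for CountryMercatorTiles below is an assumption (adjust it to wherever the tile-grid utilities live in your installation), and the country code and zoom level are illustrative:

from gigaspatial.grid import CountryMercatorTiles  # assumed import path
from gigaspatial.handlers.maxar_image import MaxarConfig, MaxarImageDownloader

config = MaxarConfig(
    username="my_user", password="my_password", connection_string="my-connect-id"
)
downloader = MaxarImageDownloader(config=config)

tiles = CountryMercatorTiles.create("RWA", 14)  # zoom-14 tiles covering Rwanda

downloader.download_images_by_tiles(
    mercator_tiles=tiles,
    output_dir="maxar_images/rwa",
    image_size=(512, 512),
    image_prefix="rwa_",
)
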

microsoft_global_buildings

MSBuildingsConfig dataclass

Bases: BaseHandlerConfig

Configuration for Microsoft Global Buildings dataset files.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class MSBuildingsConfig(BaseHandlerConfig):
    """Configuration for Microsoft Global Buildings dataset files."""

    TILE_URLS: str = (
        "https://minedbuildings.z5.web.core.windows.net/global-buildings/dataset-links.csv"
    )
    MERCATOR_ZOOM_LEVEL: int = 9
    base_path: Path = global_config.get_path("microsoft_global_buildings", "bronze")

    LOCATION_MAPPING_FILE: Path = base_path / "location_mapping.json"
    SIMILARITY_SCORE: float = 0.8
    DEFAULT_MAPPING: Dict[str, str] = field(
        default_factory=lambda: {
            "Bonaire": "BES",
            "Brunei": "BRN",
            "IvoryCoast": "CIV",
            "CongoDRC": "COD",
            "DemocraticRepublicoftheCongo": "COD",
            "RepublicoftheCongo": "COG",
            "TheGambia": "GMB",
            "FYROMakedonija": "MKD",
            "SultanateofOman": "OMN",
            "StateofQatar": "QAT",
            "Russia": "RUS",
            "KingdomofSaudiArabia": "SAU",
            "Svalbard": "SJM",
            "Swaziland": "SWZ",
            "StMartin": "SXM",
            "leSaint-Martin": "MAF",
            "Turkey": "TUR",
            "VaticanCity": "VAT",
            "BritishVirginIslands": "VGB",
            "USVirginIslands": "VIR",
            "RepublicofYemen": "YEM",
            "CzechRepublic": "CZE",
            "French-Martinique": "MTQ",
            "French-Guadeloupe": "GLP",
            "UnitedStates": "USA",
        }
    )
    CUSTOM_MAPPING: Optional[Dict[str, str]] = None

    def __post_init__(self):
        """Initialize the configuration, load tile URLs, and set up location mapping."""
        super().__post_init__()
        self._load_tile_urls()
        self.upload_date = self.df_tiles.upload_date[0]
        self._setup_location_mapping()

    def _load_tile_urls(self):
        """Load dataset links from csv file."""
        self.df_tiles = pd.read_csv(
            self.TILE_URLS,
            names=["location", "quadkey", "url", "size", "upload_date"],
            dtype={"quadkey": str},
            header=0,
        )

    def _setup_location_mapping(self):
        """Load or create the mapping between dataset locations and ISO country codes."""
        from gigaspatial.core.io.readers import read_json
        from gigaspatial.core.io.writers import write_json

        if self.data_store.file_exists(str(self.LOCATION_MAPPING_FILE)):
            self.location_mapping = read_json(
                self.data_store, str(self.LOCATION_MAPPING_FILE)
            )
        else:
            self.location_mapping = self.create_location_mapping(
                similarity_score_threshold=self.SIMILARITY_SCORE
            )
            self.location_mapping.update(self.DEFAULT_MAPPING)
            write_json(
                self.location_mapping, self.data_store, str(self.LOCATION_MAPPING_FILE)
            )

        self.location_mapping.update(self.CUSTOM_MAPPING or {})
        self._map_locations()
        self.df_tiles.loc[self.df_tiles.country.isnull(), "country"] = None

    def _map_locations(self):
        """Map the 'location' column in the tiles DataFrame to ISO country codes."""
        self.df_tiles["country"] = self.df_tiles.location.map(self.location_mapping)

    def create_location_mapping(self, similarity_score_threshold: float = 0.8):
        """
        Create a mapping between the dataset's location names and ISO 3166-1 alpha-3 country codes.

        This function iterates through known countries and attempts to find matching
        locations in the dataset based on string similarity.

        Args:
            similarity_score_threshold: The minimum similarity score (between 0 and 1)
                                        for a dataset location to be considered a match
                                        for a country. Defaults to 0.8.

        Returns:
            A dictionary where keys are dataset location names and values are
            the corresponding ISO 3166-1 alpha-3 country codes.
        """

        def similar(a, b):
            return SequenceMatcher(None, a, b).ratio()

        location_mapping = dict()

        for country in pycountry.countries:
            if country.name not in self.df_tiles.location.unique():
                try:
                    country_quadkey = CountryMercatorTiles.create(
                        country.alpha_3, self.MERCATOR_ZOOM_LEVEL
                    )
                except Exception:
                    self.logger.warning(f"{country.name} is not mapped.")
                    continue
                country_datasets = country_quadkey.filter_quadkeys(
                    self.df_tiles.quadkey
                )
                matching_locations = self.df_tiles[
                    self.df_tiles.quadkey.isin(country_datasets.quadkeys)
                ].location.unique()
                scores = np.array(
                    [
                        (
                            similar(c, country.common_name)
                            if hasattr(country, "common_name")
                            else similar(c, country.name)
                        )
                        for c in matching_locations
                    ]
                )
                if any(scores > similarity_score_threshold):
                    matched = matching_locations[scores > similarity_score_threshold]
                    if len(matched) > 2:
                        self.logger.warning(
                            f"Multiple matches exist for {country.name}. {country.name} is not mapped."
                        )
                    location_mapping[matched[0]] = country.alpha_3
                    self.logger.debug(f"{country.name} matched with {matched[0]}!")
                else:
                    self.logger.warning(
                        f"No direct matches for {country.name}. {country.name} is not mapped."
                    )
                    self.logger.debug("Possible matches are: ")
                    for c, score in zip(matching_locations, scores):
                        self.logger.debug(f"{c}: {score}")
            else:
                location_mapping[country.name] = country.alpha_3

        return location_mapping

    def get_relevant_data_units_by_geometry(
        self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
    ) -> pd.DataFrame:
        """
        Return intersecting tiles for a given geometry or GeoDataFrame.
        """
        return self._get_relevant_tiles(geometry)

    def get_relevant_data_units_by_points(
        self, points: Iterable[Union[Point, tuple]], **kwargs
    ) -> pd.DataFrame:
        """
        Return intersecting tiles for a list of points.
        """
        return self._get_relevant_tiles(points)

    def get_relevant_data_units_by_country(
        self, country: str, **kwargs
    ) -> pd.DataFrame:
        """
        Return intersecting tiles for a given country.
        """
        return self._get_relevant_tiles(country)

    def get_data_unit_path(self, unit: Union[pd.Series, dict], **kwargs) -> Path:

        tile_location = unit["country"] if unit["country"] else unit["location"]

        return (
            self.base_path
            / tile_location
            / self.upload_date
            / f'{unit["quadkey"]}.csv.gz'
        )

    def get_data_unit_paths(
        self, units: Union[pd.DataFrame, Iterable[dict]], **kwargs
    ) -> List:
        if isinstance(units, pd.DataFrame):
            return [self.get_data_unit_path(row) for _, row in units.iterrows()]
        return super().get_data_unit_paths(units)

    def _get_relevant_tiles(
        self,
        source: Union[
            str,  # country
            BaseGeometry,  # shapely geoms
            gpd.GeoDataFrame,
            Iterable[Union[Point, Tuple[float, float]]],  # points
        ],
    ) -> pd.DataFrame:
        """
        Get the Microsoft Buildings tiles that intersect with a given spatial source.

        If a country is given, this method first tries to find tiles directly mapped
        to that country. If no directly mapped tiles are found and the country is not
        in the location mapping, it attempts to find overlapping tiles by creating
        Mercator tiles for the country and filtering the dataset's tiles.

        Args:
            source: A country code/name, a Shapely geometry, a GeoDataFrame, or a list of Point
                    objects or (lat, lon) tuples representing the area of interest.
                    The coordinates are assumed to be in EPSG:4326.

        Returns:
            A pandas DataFrame containing the rows from the tiles list that
            spatially intersect with the `source`. Returns an empty DataFrame
            if no intersecting tiles are found.
        """
        if isinstance(source, str):
            try:
                country_code = pycountry.countries.lookup(source).alpha_3
            except LookupError:
                raise ValueError("Invalid `country` value!")

            mask = self.df_tiles["country"] == country_code

            if any(mask):
                return self.df_tiles.loc[
                    mask, ["quadkey", "url", "country", "location"]
                ].to_dict("records")

            self.logger.warning(
                f"The country code '{country_code}' is not directly in the location mapping. "
                "Manually checking for overlapping locations with the country boundary."
            )

            source_tiles = CountryMercatorTiles.create(
                country_code, self.MERCATOR_ZOOM_LEVEL
            )
        else:
            source_tiles = MercatorTiles.from_spatial(
                source=source, zoom_level=self.MERCATOR_ZOOM_LEVEL
            )

        filtered_tiles = source_tiles.filter_quadkeys(self.df_tiles.quadkey)

        mask = self.df_tiles.quadkey.isin(filtered_tiles.quadkeys)

        return self.df_tiles.loc[
            mask, ["quadkey", "url", "country", "location"]
        ].to_dict("records")
__post_init__()

Initialize the configuration, load tile URLs, and set up location mapping.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def __post_init__(self):
    """Initialize the configuration, load tile URLs, and set up location mapping."""
    super().__post_init__()
    self._load_tile_urls()
    self.upload_date = self.df_tiles.upload_date[0]
    self._setup_location_mapping()
create_location_mapping(similarity_score_threshold=0.8)

Create a mapping between the dataset's location names and ISO 3166-1 alpha-3 country codes.

This function iterates through known countries and attempts to find matching locations in the dataset based on string similarity.

Parameters:

Name Type Description Default
similarity_score_threshold float

The minimum similarity score (between 0 and 1) for a dataset location to be considered a match for a country. Defaults to 0.8.

0.8

Returns:

Type Description

A dictionary where keys are dataset location names and values are the corresponding ISO 3166-1 alpha-3 country codes.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def create_location_mapping(self, similarity_score_threshold: float = 0.8):
    """
    Create a mapping between the dataset's location names and ISO 3166-1 alpha-3 country codes.

    This function iterates through known countries and attempts to find matching
    locations in the dataset based on string similarity.

    Args:
        similarity_score_threshold: The minimum similarity score (between 0 and 1)
                                    for a dataset location to be considered a match
                                    for a country. Defaults to 0.8.

    Returns:
        A dictionary where keys are dataset location names and values are
        the corresponding ISO 3166-1 alpha-3 country codes.
    """

    def similar(a, b):
        return SequenceMatcher(None, a, b).ratio()

    location_mapping = dict()

    for country in pycountry.countries:
        if country.name not in self.df_tiles.location.unique():
            try:
                country_quadkey = CountryMercatorTiles.create(
                    country.alpha_3, self.MERCATOR_ZOOM_LEVEL
                )
            except Exception:
                self.logger.warning(f"{country.name} is not mapped.")
                continue
            country_datasets = country_quadkey.filter_quadkeys(
                self.df_tiles.quadkey
            )
            matching_locations = self.df_tiles[
                self.df_tiles.quadkey.isin(country_datasets.quadkeys)
            ].location.unique()
            scores = np.array(
                [
                    (
                        similar(c, country.common_name)
                        if hasattr(country, "common_name")
                        else similar(c, country.name)
                    )
                    for c in matching_locations
                ]
            )
            if any(scores > similarity_score_threshold):
                matched = matching_locations[scores > similarity_score_threshold]
                if len(matched) > 2:
                    self.logger.warning(
                        f"Multiple matches exist for {country.name}. {country.name} is not mapped."
                    )
                location_mapping[matched[0]] = country.alpha_3
                self.logger.debug(f"{country.name} matched with {matched[0]}!")
            else:
                self.logger.warning(
                    f"No direct matches for {country.name}. {country.name} is not mapped."
                )
                self.logger.debug("Possible matches are: ")
                for c, score in zip(matching_locations, scores):
                    self.logger.debug(f"{c}: {score}")
        else:
            location_mapping[country.name] = country.alpha_3

    return location_mapping
get_relevant_data_units_by_country(country, **kwargs)

Return intersecting tiles for a given country.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def get_relevant_data_units_by_country(
    self, country: str, **kwargs
) -> pd.DataFrame:
    """
    Return intersecting tiles for a given country.
    """
    return self._get_relevant_tiles(country)
get_relevant_data_units_by_geometry(geometry, **kwargs)

Return intersecting tiles for a given geometry or GeoDataFrame.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def get_relevant_data_units_by_geometry(
    self, geometry: Union[BaseGeometry, gpd.GeoDataFrame], **kwargs
) -> pd.DataFrame:
    """
    Return intersecting tiles for a given geometry or GeoDataFrame.
    """
    return self._get_relevant_tiles(geometry)
get_relevant_data_units_by_points(points, **kwargs)

Return intersecting tiles for a list of points.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def get_relevant_data_units_by_points(
    self, points: Iterable[Union[Point, tuple]], **kwargs
) -> pd.DataFrame:
    """
    Return intersecting tiles for a list of points.
    """
    return self._get_relevant_tiles(points)
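
Resolved tile records are turned into local paths by get_data_unit_path. A hedged sketch of the resulting layout; the quadkey and URL below are made up for illustration, and note that instantiating MSBuildingsConfig fetches the dataset-links CSV:

from gigaspatial.handlers.microsoft_global_buildings import MSBuildingsConfig

config = MSBuildingsConfig()

tile = {
    "quadkey": "123222011",                          # hypothetical quadkey
    "url": "https://example.invalid/tile.csv.gz",    # hypothetical URL
    "country": "KEN",
    "location": "Kenya",
}
print(config.get_data_unit_path(tile))
# <base_path>/KEN/<upload_date>/123222011.csv.gz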

MSBuildingsDownloader

Bases: BaseHandlerDownloader

A class to handle downloads of Microsoft's Global ML Building Footprints dataset.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
class MSBuildingsDownloader(BaseHandlerDownloader):
    """A class to handle downloads of Microsoft's Global ML Building Footprints dataset."""

    def __init__(
        self,
        config: Optional[MSBuildingsConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Optional configuration for customizing download behavior and file paths.
                    If None, a default `MSBuildingsConfig` is used.
            data_store: Optional instance of a `DataStore` for managing data storage.
                        If provided, it overrides the `data_store` in the `config`.
                        If None, the `data_store` from the `config` is used.
            logger: Optional custom logger instance. If None, a default logger
                    named after the module is created and used.
        """
        config = config or MSBuildingsConfig()
        super().__init__(config=config, data_store=data_store, logger=logger)

    def download_data_unit(
        self,
        tile_info: Union[pd.Series, dict],
        **kwargs,
    ) -> Optional[str]:
        """Download data file for a single tile."""

        tile_url = tile_info["url"]

        try:
            response = requests.get(tile_url, stream=True)
            response.raise_for_status()

            file_path = str(self.config.get_data_unit_path(tile_info))

            with self.data_store.open(file_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

                self.logger.debug(
                    f"Successfully downloaded tile: {tile_info['quadkey']}"
                )
                return file_path

        except requests.exceptions.RequestException as e:
            self.logger.error(
                f"Failed to download tile {tile_info['quadkey']}: {str(e)}"
            )
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
            return None

    def download_data_units(
        self,
        tiles: Union[pd.DataFrame, List[dict]],
        **kwargs,
    ) -> List[str]:
        """Download data files for multiple tiles."""

        if len(tiles) == 0:
            self.logger.warning(f"There is no matching data")
            return []

        with multiprocessing.Pool(self.config.n_workers) as pool:
            download_func = functools.partial(self.download_data_unit)
            file_paths = list(
                tqdm(
                    pool.imap(
                        download_func,
                        (
                            [row for _, row in tiles.iterrows()]
                            if isinstance(tiles, pd.DataFrame)
                            else tiles
                        ),
                    ),
                    total=len(tiles),
                    desc=f"Downloading polygons data",
                )
            )

        return [path for path in file_paths if path is not None]

    def download(
        self,
        source: Union[
            str,  # country
            List[Union[Tuple[float, float], Point]],  # points
            BaseGeometry,  # shapely geoms
            gpd.GeoDataFrame,
        ],
        **kwargs,
    ) -> List[str]:
        """
        Download Microsoft Global ML Building Footprints data for a specified geographic region.

        The region can be defined by a country, a list of points,
        a Shapely geometry, or a GeoDataFrame. This method identifies the
        relevant data tiles intersecting the region and downloads them in parallel.

        Args:
            source: Defines the geographic area for which to download data.
                    Can be:
                      - A string representing a country code or name.
                      - A list of (latitude, longitude) tuples or Shapely Point objects.
                      - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                      - A GeoDataFrame with a geometry column in EPSG:4326.
            **kwargs: Additional parameters passed to data unit resolution methods

        Returns:
            A list of local file paths for the successfully downloaded tiles.
            Returns an empty list if no data is found for the region or if
            all downloads fail.
        """

        tiles = self.config.get_relevant_data_units(source, **kwargs)
        return self.download_data_units(tiles, **kwargs)

    def download_by_country(
        self,
        country: str,
        data_store: Optional[DataStore] = None,
        country_geom_path: Optional[Union[str, Path]] = None,
    ) -> List[str]:
        """
        Download Microsoft Global ML Building Footprints data for a specific country.

        This is a convenience method to download data for an entire country
        using its code or name.

        Args:
            country: The country code (e.g., 'USA', 'GBR') or name.
            data_store: Optional instance of a `DataStore` to be used by
                `AdminBoundaries` for loading country boundaries. If None,
                `AdminBoundaries` will use its default data loading.
            country_geom_path: Optional path to a GeoJSON file containing the
                country boundary. If provided, this boundary is used
                instead of the default from `AdminBoundaries`.

        Returns:
            A list of local file paths for the successfully downloaded tiles.
            Returns an empty list if no data is found for the country or if
            all downloads fail.
        """
        return self.download(
            source=country, data_store=data_store, path=country_geom_path
        )
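
A minimal usage sketch of the downloader with the default configuration and data store; the country code and bounding box are illustrative:

from shapely.geometry import box
from gigaspatial.handlers.microsoft_global_buildings import MSBuildingsDownloader

downloader = MSBuildingsDownloader()

# Download every tile intersecting Rwanda; returns the list of local file paths.
paths = downloader.download_by_country("RWA")

# Or restrict the download to an arbitrary area of interest (EPSG:4326 bounds).
aoi_paths = downloader.download(box(29.0, -2.9, 31.0, -1.0))
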
__init__(config=None, data_store=None, logger=None)

Initialize the downloader.

Parameters:

Name Type Description Default
config Optional[MSBuildingsConfig]

Optional configuration for customizing download behavior and file paths. If None, a default MSBuildingsConfig is used.

None
data_store Optional[DataStore]

Optional instance of a DataStore for managing data storage. If provided, it overrides the data_store in the config. If None, the data_store from the config is used.

None
logger Optional[Logger]

Optional custom logger instance. If None, a default logger named after the module is created and used.

None
Source code in gigaspatial/handlers/microsoft_global_buildings.py
def __init__(
    self,
    config: Optional[MSBuildingsConfig] = None,
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: Optional configuration for customizing download behavior and file paths.
                If None, a default `MSBuildingsConfig` is used.
        data_store: Optional instance of a `DataStore` for managing data storage.
                    If provided, it overrides the `data_store` in the `config`.
                    If None, the `data_store` from the `config` is used.
        logger: Optional custom logger instance. If None, a default logger
                named after the module is created and used.
    """
    config = config or MSBuildingsConfig()
    super().__init__(config=config, data_store=data_store, logger=logger)
download(source, **kwargs)

Download Microsoft Global ML Building Footprints data for a specified geographic region.

The region can be defined by a country, a list of points, a Shapely geometry, or a GeoDataFrame. This method identifies the relevant data tiles intersecting the region and downloads them in parallel.

Parameters:

Name Type Description Default
source Union[str, List[Union[Tuple[float, float], Point]], BaseGeometry, GeoDataFrame]

Defines the geographic area for which to download data. Can be: - A string representing a country code or name. - A list of (latitude, longitude) tuples or Shapely Point objects. - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon). - A GeoDataFrame with a geometry column in EPSG:4326.

required
**kwargs

Additional parameters passed to data unit resolution methods

{}

Returns:

Type Description
List[str]

A list of local file paths for the successfully downloaded tiles. Returns an empty list if no data is found for the region or if all downloads fail.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def download(
    self,
    source: Union[
        str,  # country
        List[Union[Tuple[float, float], Point]],  # points
        BaseGeometry,  # shapely geoms
        gpd.GeoDataFrame,
    ],
    **kwargs,
) -> List[str]:
    """
    Download Microsoft Global ML Building Footprints data for a specified geographic region.

    The region can be defined by a country, a list of points,
    a Shapely geometry, or a GeoDataFrame. This method identifies the
    relevant data tiles intersecting the region and downloads them in parallel.

    Args:
        source: Defines the geographic area for which to download data.
                Can be:
                  - A string representing a country code or name.
                  - A list of (latitude, longitude) tuples or Shapely Point objects.
                  - A Shapely BaseGeometry object (e.g., Polygon, MultiPolygon).
                  - A GeoDataFrame with a geometry column in EPSG:4326.
        **kwargs: Additional parameters passed to data unit resolution methods

    Returns:
        A list of local file paths for the successfully downloaded tiles.
        Returns an empty list if no data is found for the region or if
        all downloads fail.
    """

    tiles = self.config.get_relevant_data_units(source, **kwargs)
    return self.download_data_units(tiles, **kwargs)
download_by_country(country, data_store=None, country_geom_path=None)

Download Microsoft Global ML Building Footprints data for a specific country.

This is a convenience method to download data for an entire country using its code or name.

Parameters:

Name Type Description Default
country str

The country code (e.g., 'USA', 'GBR') or name.

required
data_store Optional[DataStore]

Optional instance of a DataStore to be used by AdminBoundaries for loading country boundaries. If None, AdminBoundaries will use its default data loading.

None
country_geom_path Optional[Union[str, Path]]

Optional path to a GeoJSON file containing the country boundary. If provided, this boundary is used instead of the default from AdminBoundaries.

None

Returns:

Type Description
List[str]

A list of local file paths for the successfully downloaded tiles. Returns an empty list if no data is found for the country or if all downloads fail.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def download_by_country(
    self,
    country: str,
    data_store: Optional[DataStore] = None,
    country_geom_path: Optional[Union[str, Path]] = None,
) -> List[str]:
    """
    Download Microsoft Global ML Building Footprints data for a specific country.

    This is a convenience method to download data for an entire country
    using its code or name.

    Args:
        country: The country code (e.g., 'USA', 'GBR') or name.
        data_store: Optional instance of a `DataStore` to be used by
            `AdminBoundaries` for loading country boundaries. If None,
            `AdminBoundaries` will use its default data loading.
        country_geom_path: Optional path to a GeoJSON file containing the
            country boundary. If provided, this boundary is used
            instead of the default from `AdminBoundaries`.

    Returns:
        A list of local file paths for the successfully downloaded tiles.
        Returns an empty list if no data is found for the country or if
        all downloads fail.
    """
    return self.download(
        source=country, data_store=data_store, path=country_geom_path
    )
download_data_unit(tile_info, **kwargs)

Download data file for a single tile.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def download_data_unit(
    self,
    tile_info: Union[pd.Series, dict],
    **kwargs,
) -> Optional[str]:
    """Download data file for a single tile."""

    tile_url = tile_info["url"]

    try:
        response = requests.get(tile_url, stream=True)
        response.raise_for_status()

        file_path = str(self.config.get_data_unit_path(tile_info))

        with self.data_store.open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

            self.logger.debug(
                f"Successfully downloaded tile: {tile_info['quadkey']}"
            )
            return file_path

    except requests.exceptions.RequestException as e:
        self.logger.error(
            f"Failed to download tile {tile_info['quadkey']}: {str(e)}"
        )
        return None
    except Exception as e:
        self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
        return None
download_data_units(tiles, **kwargs)

Download data files for multiple tiles.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def download_data_units(
    self,
    tiles: Union[pd.DataFrame, List[dict]],
    **kwargs,
) -> List[str]:
    """Download data files for multiple tiles."""

    if len(tiles) == 0:
        self.logger.warning(f"There is no matching data")
        return []

    with multiprocessing.Pool(self.config.n_workers) as pool:
        download_func = functools.partial(self.download_data_unit)
        file_paths = list(
            tqdm(
                pool.imap(
                    download_func,
                    (
                        [row for _, row in tiles.iterrows()]
                        if isinstance(tiles, pd.DataFrame)
                        else tiles
                    ),
                ),
                total=len(tiles),
                desc=f"Downloading polygons data",
            )
        )

    return [path for path in file_paths if path is not None]

MSBuildingsHandler

Bases: BaseHandler

Handler for Microsoft Global Buildings dataset.

This class provides a unified interface for downloading and loading Microsoft Global Buildings data. It manages the lifecycle of configuration, downloading, and reading components.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
class MSBuildingsHandler(BaseHandler):
    """
    Handler for Microsoft Global Buildings dataset.

    This class provides a unified interface for downloading and loading Microsoft Global Buildings data.
    It manages the lifecycle of configuration, downloading, and reading components.
    """

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> MSBuildingsConfig:
        """
        Create and return a MSBuildingsConfig instance.

        Args:
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional configuration parameters

        Returns:
            Configured MSBuildingsConfig instance
        """
        return MSBuildingsConfig(data_store=data_store, logger=logger, **kwargs)

    def create_downloader(
        self,
        config: MSBuildingsConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> MSBuildingsDownloader:
        """
        Create and return a MSBuildingsDownloader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional downloader parameters

        Returns:
            Configured MSBuildingsDownloader instance
        """
        return MSBuildingsDownloader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )

    def create_reader(
        self,
        config: MSBuildingsConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> MSBuildingsReader:
        """
        Create and return a MSBuildingsReader instance.

        Args:
            config: The configuration object
            data_store: The data store instance to use
            logger: The logger instance to use
            **kwargs: Additional reader parameters

        Returns:
            Configured MSBuildingsReader instance
        """
        return MSBuildingsReader(
            config=config, data_store=data_store, logger=logger, **kwargs
        )
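
A hedged end-to-end sketch of the unified handler interface, wiring the downloader and reader together (the country code is illustrative):

from gigaspatial.handlers.microsoft_global_buildings import MSBuildingsHandler

handler = MSBuildingsHandler()   # builds config, downloader and reader with defaults

paths = handler.downloader.download_by_country("RWA")   # fetch intersecting tiles
buildings = handler.reader.load_from_paths(paths)       # load them as a GeoDataFrame
print(len(buildings), "building footprints loaded")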
create_config(data_store, logger, **kwargs)

Create and return a MSBuildingsConfig instance.

Parameters:

Name Type Description Default
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional configuration parameters

{}

Returns:

Type Description
MSBuildingsConfig

Configured MSBuildingsConfig instance

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> MSBuildingsConfig:
    """
    Create and return a MSBuildingsConfig instance.

    Args:
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional configuration parameters

    Returns:
        Configured MSBuildingsConfig instance
    """
    return MSBuildingsConfig(data_store=data_store, logger=logger, **kwargs)
create_downloader(config, data_store, logger, **kwargs)

Create and return a MSBuildingsDownloader instance.

Parameters:

Name Type Description Default
config MSBuildingsConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional downloader parameters

{}

Returns:

Type Description
MSBuildingsDownloader

Configured MSBuildingsDownloader instance

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def create_downloader(
    self,
    config: MSBuildingsConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> MSBuildingsDownloader:
    """
    Create and return a MSBuildingsDownloader instance.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional downloader parameters

    Returns:
        Configured MSBuildingsDownloader instance
    """
    return MSBuildingsDownloader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a MSBuildingsReader instance.

Parameters:

Name Type Description Default
config MSBuildingsConfig

The configuration object

required
data_store DataStore

The data store instance to use

required
logger Logger

The logger instance to use

required
**kwargs

Additional reader parameters

{}

Returns:

Type Description
MSBuildingsReader

Configured MSBuildingsReader instance

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def create_reader(
    self,
    config: MSBuildingsConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> MSBuildingsReader:
    """
    Create and return a MSBuildingsReader instance.

    Args:
        config: The configuration object
        data_store: The data store instance to use
        logger: The logger instance to use
        **kwargs: Additional reader parameters

    Returns:
        Configured MSBuildingsReader instance
    """
    return MSBuildingsReader(
        config=config, data_store=data_store, logger=logger, **kwargs
    )

MSBuildingsReader

Bases: BaseHandlerReader

Reader for Microsoft Global Buildings data, supporting country, points, and geometry-based resolution.

Source code in gigaspatial/handlers/microsoft_global_buildings.py
class MSBuildingsReader(BaseHandlerReader):
    """
    Reader for Microsoft Global Buildings data, supporting country, points, and geometry-based resolution.
    """

    def __init__(
        self,
        config: Optional[MSBuildingsConfig] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        config = config or MSBuildingsConfig()
        super().__init__(config=config, data_store=data_store, logger=logger)

    def load_from_paths(
        self, source_data_path: List[Union[str, Path]], **kwargs
    ) -> gpd.GeoDataFrame:
        """
        Load building data from Microsoft Buildings dataset.
        Args:
            source_data_path: List of file paths to load
        Returns:
            GeoDataFrame containing building data
        """
        from gigaspatial.core.io.readers import read_gzipped_json_or_csv
        from shapely.geometry import shape

        def read_ms_dataset(data_store: DataStore, file_path: str):
            df = read_gzipped_json_or_csv(file_path=file_path, data_store=data_store)
            df["geometry"] = df["geometry"].apply(shape)
            return gpd.GeoDataFrame(df, crs=4326)

        result = self._load_tabular_data(
            file_paths=source_data_path, read_function=read_ms_dataset
        )
        return result
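
The reader's read_ms_dataset helper converts the GeoJSON-style geometry column to shapely objects before building the GeoDataFrame; the same pattern in isolation, with a toy polygon:

import pandas as pd
import geopandas as gpd
from shapely.geometry import shape

df = pd.DataFrame(
    {"geometry": [{"type": "Polygon", "coordinates": [[[0, 0], [0, 1], [1, 1], [0, 0]]]}]}
)
df["geometry"] = df["geometry"].apply(shape)   # dict -> shapely Polygon
gdf = gpd.GeoDataFrame(df, crs=4326)
print(gdf.geometry.iloc[0].area)               # 0.5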
load_from_paths(source_data_path, **kwargs)

Load building data from the Microsoft Buildings dataset.

Parameters:

Name Type Description Default
source_data_path List[Union[str, Path]]

List of file paths to load

required

Returns:

Type Description
GeoDataFrame

GeoDataFrame containing building data

Source code in gigaspatial/handlers/microsoft_global_buildings.py
def load_from_paths(
    self, source_data_path: List[Union[str, Path]], **kwargs
) -> gpd.GeoDataFrame:
    """
    Load building data from Microsoft Buildings dataset.
    Args:
        source_data_path: List of file paths to load
    Returns:
        GeoDataFrame containing building data
    """
    from gigaspatial.core.io.readers import read_gzipped_json_or_csv
    from shapely.geometry import shape

    def read_ms_dataset(data_store: DataStore, file_path: str):
        df = read_gzipped_json_or_csv(file_path=file_path, data_store=data_store)
        df["geometry"] = df["geometry"].apply(shape)
        return gpd.GeoDataFrame(df, crs=4326)

    result = self._load_tabular_data(
        file_paths=source_data_path, read_function=read_ms_dataset
    )
    return result

opencellid

OpenCellIDConfig

Bases: BaseModel

Configuration for OpenCellID data access

Source code in gigaspatial/handlers/opencellid.py
class OpenCellIDConfig(BaseModel):
    """Configuration for OpenCellID data access"""

    # Base URLs
    BASE_URL: HttpUrl = Field(default="https://opencellid.org/")
    DOWNLOAD_URL: HttpUrl = Field(default="https://opencellid.org/downloads.php?token=")

    # User configuration
    country: str = Field(...)
    api_token: str = Field(
        default=global_config.OPENCELLID_ACCESS_TOKEN,
        description="OpenCellID API Access Token",
    )
    base_path: Path = Field(default=global_config.get_path("opencellid", "bronze"))
    created_newer: int = Field(
        default=2003, description="Filter out cell towers added before this year"
    )
    created_before: int = Field(
        default=datetime.now().year,
        description="Filter out cell towers added after this year",
    )
    drop_duplicates: bool = Field(
        default=True,
        description="Drop cells that are in the exact same location and radio technology",
    )

    @field_validator("country")
    def validate_country(cls, value: str) -> str:
        try:
            return pycountry.countries.lookup(value).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {value}")

    @property
    def output_file_path(self) -> Path:
        """Path to save the downloaded OpenCellID data"""
        return self.base_path / f"opencellid_{self.country.lower()}.csv.gz"

    def __repr__(self) -> str:
        return (
            f"OpenCellIDConfig(\n"
            f"  country='{self.country}'\n"
            f"  created_newer={self.created_newer}\n"
            f"  created_before={self.created_before}\n"
            f"  drop_duplicates={self.drop_duplicates}\n"
            f")"
        )
output_file_path: Path property

Path to save the downloaded OpenCellID data
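
A short configuration sketch; the token is a placeholder, and the country name is resolved to its ISO alpha-3 code by the validator:

from gigaspatial.handlers.opencellid import OpenCellIDConfig

config = OpenCellIDConfig(
    country="Kenya",                     # stored as "KEN"
    api_token="<your OpenCellID token>",
    created_newer=2015,                  # keep towers first recorded in 2015 or later
)
print(config.output_file_path)           # <base_path>/opencellid_ken.csv.gz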

OpenCellIDDownloader

Downloader for OpenCellID data

Source code in gigaspatial/handlers/opencellid.py
class OpenCellIDDownloader:
    """Downloader for OpenCellID data"""

    def __init__(
        self,
        config: Union[OpenCellIDConfig, dict],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        if isinstance(config, dict):
            self.config = OpenCellIDConfig(**config)
        else:
            self.config = config

        self.data_store = data_store or LocalDataStore()
        self.logger = logger or global_config.get_logger(self.__class__.__name__)

    @classmethod
    def from_country(
        cls,
        country: str,
        api_token: str = global_config.OPENCELLID_ACCESS_TOKEN,
        **kwargs,
    ):
        """Create a downloader for a specific country"""
        config = OpenCellIDConfig(country=country, api_token=api_token, **kwargs)
        return cls(config=config)

    def get_download_links(self) -> List[str]:
        """Get download links for the country from OpenCellID website"""
        url = f"{self.config.DOWNLOAD_URL}{self.config.api_token}"
        country_alpha2 = pycountry.countries.get(
            alpha_3=self.config.country.upper()
        ).alpha_2

        try:
            # Find table with cell tower data links
            self.logger.info(f"Fetching download links for {self.config.country}")
            html_content = requests.get(url).text
            soup = BeautifulSoup(html_content, "lxml")
            table = soup.find("table", {"id": "regions"})

            if not table:
                raise ValueError(
                    "Could not find cell tower data table on OpenCellID website"
                )

            # Parse table headers
            t_headers = []
            for th in table.find_all("th"):
                t_headers.append(th.text.replace("\n", " ").strip())

            # Parse table data
            table_data = []
            for tr in table.tbody.find_all("tr"):
                t_row = {}

                for td, th in zip(tr.find_all("td"), t_headers):
                    if "Files" in th:
                        t_row[th] = []
                        for a in td.find_all("a"):
                            t_row[th].append(a.get("href"))
                    else:
                        t_row[th] = td.text.replace("\n", "").strip()

                table_data.append(t_row)

            cell_dict = pd.DataFrame(table_data)

            # Get links for the country code
            if country_alpha2 not in cell_dict["Country Code"].values:
                raise ValueError(
                    f"Country code {country_alpha2} not found in OpenCellID database"
                )
            else:
                links = cell_dict[cell_dict["Country Code"] == country_alpha2][
                    "Files (grouped by MCC)"
                ].values[0]

            return links

        except Exception as e:
            self.logger.error(f"Error fetching download links: {str(e)}")
            raise

    def download_and_process(self) -> str:
        """Download and process OpenCellID data for the configured country"""

        try:
            links = self.get_download_links()
            self.logger.info(f"Found {len(links)} data files for {self.config.country}")

            dfs = []

            for link in links:
                self.logger.info(f"Downloading data from {link}")
                response = requests.get(link, stream=True)
                response.raise_for_status()

                # Use a temporary file for download
                with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as tmpfile:
                    for chunk in response.iter_content(chunk_size=1024):
                        if chunk:
                            tmpfile.write(chunk)
                    temp_file = tmpfile.name

                try:
                    # Read the downloaded gzipped CSV data
                    with gzip.open(temp_file, "rt") as feed_data:
                        dfs.append(
                            pd.read_csv(
                                feed_data,
                                names=[
                                    "radio",
                                    "mcc",
                                    "net",
                                    "area",
                                    "cell",
                                    "unit",
                                    "lon",
                                    "lat",
                                    "range",
                                    "samples",
                                    "changeable",
                                    "created",
                                    "updated",
                                    "average_signal",
                                ],
                            )
                        )
                except IOError as e:
                    with open(temp_file, "r") as error_file:
                        contents = error_file.readline()

                    if "RATE_LIMITED" in contents:
                        raise RuntimeError(
                            "API rate limit exceeded. You're rate-limited!"
                        )
                    elif "INVALID_TOKEN" in contents:
                        raise RuntimeError("API token rejected by OpenCellID!")
                    else:
                        raise RuntimeError(
                            f"Error processing downloaded data: {str(e)}"
                        )
                finally:
                    # Clean up temporary file
                    if os.path.exists(temp_file):
                        os.remove(temp_file)

            df_cell = pd.concat(dfs, ignore_index=True)

            # Process the data
            if not df_cell.empty:
                # Convert timestamps to datetime
                df_cell["created"] = pd.to_datetime(
                    df_cell["created"], unit="s", origin="unix"
                )
                df_cell["updated"] = pd.to_datetime(
                    df_cell["updated"], unit="s", origin="unix"
                )

                # Filter by year
                df_cell = df_cell[
                    (df_cell.created.dt.year >= self.config.created_newer)
                    & (df_cell.created.dt.year < self.config.created_before)
                ]

                # Drop duplicates if configured
                if self.config.drop_duplicates:
                    df_cell = (
                        df_cell.groupby(["radio", "lon", "lat"]).first().reset_index()
                    )

                # Save processed data using data_store
                output_path = str(self.config.output_file_path)
                self.logger.info(f"Saving processed data to {output_path}")
                with self.data_store.open(output_path, "wb") as f:
                    df_cell.to_csv(f, compression="gzip", index=False)

                return output_path
            else:
                raise ValueError(f"No data found for {self.config.country}")

        except Exception as e:
            self.logger.error(f"Error downloading and processing data: {str(e)}")
            raise
download_and_process()

Download and process OpenCellID data for the configured country

Source code in gigaspatial/handlers/opencellid.py
def download_and_process(self) -> str:
    """Download and process OpenCellID data for the configured country"""

    try:
        links = self.get_download_links()
        self.logger.info(f"Found {len(links)} data files for {self.config.country}")

        dfs = []

        for link in links:
            self.logger.info(f"Downloading data from {link}")
            response = requests.get(link, stream=True)
            response.raise_for_status()

            # Use a temporary file for download
            with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as tmpfile:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        tmpfile.write(chunk)
                temp_file = tmpfile.name

            try:
                # Read the downloaded gzipped CSV data
                with gzip.open(temp_file, "rt") as feed_data:
                    dfs.append(
                        pd.read_csv(
                            feed_data,
                            names=[
                                "radio",
                                "mcc",
                                "net",
                                "area",
                                "cell",
                                "unit",
                                "lon",
                                "lat",
                                "range",
                                "samples",
                                "changeable",
                                "created",
                                "updated",
                                "average_signal",
                            ],
                        )
                    )
            except IOError as e:
                with open(temp_file, "r") as error_file:
                    contents = error_file.readline()

                if "RATE_LIMITED" in contents:
                    raise RuntimeError(
                        "API rate limit exceeded. You're rate-limited!"
                    )
                elif "INVALID_TOKEN" in contents:
                    raise RuntimeError("API token rejected by OpenCellID!")
                else:
                    raise RuntimeError(
                        f"Error processing downloaded data: {str(e)}"
                    )
            finally:
                # Clean up temporary file
                if os.path.exists(temp_file):
                    os.remove(temp_file)

        df_cell = pd.concat(dfs, ignore_index=True)

        # Process the data
        if not df_cell.empty:
            # Convert timestamps to datetime
            df_cell["created"] = pd.to_datetime(
                df_cell["created"], unit="s", origin="unix"
            )
            df_cell["updated"] = pd.to_datetime(
                df_cell["updated"], unit="s", origin="unix"
            )

            # Filter by year
            df_cell = df_cell[
                (df_cell.created.dt.year >= self.config.created_newer)
                & (df_cell.created.dt.year < self.config.created_before)
            ]

            # Drop duplicates if configured
            if self.config.drop_duplicates:
                df_cell = (
                    df_cell.groupby(["radio", "lon", "lat"]).first().reset_index()
                )

            # Save processed data using data_store
            output_path = str(self.config.output_file_path)
            self.logger.info(f"Saving processed data to {output_path}")
            with self.data_store.open(output_path, "wb") as f:
                df_cell.to_csv(f, compression="gzip", index=False)

            return output_path
        else:
            raise ValueError(f"No data found for {self.config.country}")

    except Exception as e:
        self.logger.error(f"Error downloading and processing data: {str(e)}")
        raise
from_country(country, api_token=global_config.OPENCELLID_ACCESS_TOKEN, **kwargs) classmethod

Create a downloader for a specific country

Source code in gigaspatial/handlers/opencellid.py
@classmethod
def from_country(
    cls,
    country: str,
    api_token: str = global_config.OPENCELLID_ACCESS_TOKEN,
    **kwargs,
):
    """Create a downloader for a specific country"""
    config = OpenCellIDConfig(country=country, api_token=api_token, **kwargs)
    return cls(config=config)

get_download_links()

Get download links for the country from OpenCellID website

Source code in gigaspatial/handlers/opencellid.py
def get_download_links(self) -> List[str]:
    """Get download links for the country from OpenCellID website"""
    url = f"{self.config.DOWNLOAD_URL}{self.config.api_token}"
    country_alpha2 = pycountry.countries.get(
        alpha_3=self.config.country.upper()
    ).alpha_2

    try:
        # Find table with cell tower data links
        self.logger.info(f"Fetching download links for {self.config.country}")
        html_content = requests.get(url).text
        soup = BeautifulSoup(html_content, "lxml")
        table = soup.find("table", {"id": "regions"})

        if not table:
            raise ValueError(
                "Could not find cell tower data table on OpenCellID website"
            )

        # Parse table headers
        t_headers = []
        for th in table.find_all("th"):
            t_headers.append(th.text.replace("\n", " ").strip())

        # Parse table data
        table_data = []
        for tr in table.tbody.find_all("tr"):
            t_row = {}

            for td, th in zip(tr.find_all("td"), t_headers):
                if "Files" in th:
                    t_row[th] = []
                    for a in td.find_all("a"):
                        t_row[th].append(a.get("href"))
                else:
                    t_row[th] = td.text.replace("\n", "").strip()

            table_data.append(t_row)

        cell_dict = pd.DataFrame(table_data)

        # Get links for the country code
        if country_alpha2 not in cell_dict["Country Code"].values:
            raise ValueError(
                f"Country code {country_alpha2} not found in OpenCellID database"
            )
        else:
            links = cell_dict[cell_dict["Country Code"] == country_alpha2][
                "Files (grouped by MCC)"
            ].values[0]

        return links

    except Exception as e:
        self.logger.error(f"Error fetching download links: {str(e)}")
        raise
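
For orientation, a minimal usage sketch of the downloader (not taken from the library's own docs): the country code and token below are placeholders, and from_country falls back to the configured OPENCELLID_ACCESS_TOKEN when no token is passed.

from gigaspatial.handlers.opencellid import OpenCellIDDownloader

# Build a downloader for a country (ISO3 code); the token argument is optional
# if OPENCELLID_ACCESS_TOKEN is set in the global configuration.
downloader = OpenCellIDDownloader.from_country("BRA", api_token="YOUR_OPENCELLID_TOKEN")

# Scrape the per-MCC download links, then download, filter by year,
# optionally deduplicate, and save a gzipped CSV to the data store.
links = downloader.get_download_links()
output_path = downloader.download_and_process()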

OpenCellIDReader

Reader for OpenCellID data

Source code in gigaspatial/handlers/opencellid.py
class OpenCellIDReader:
    """Reader for OpenCellID data"""

    def __init__(
        self,
        country: str,
        data_store: Optional[DataStore] = None,
        base_path: Optional[Path] = None,
    ):
        self.country = pycountry.countries.lookup(country).alpha_3
        self.data_store = data_store or LocalDataStore()
        self.base_path = base_path or global_config.get_path("opencellid", "bronze")

    def read_data(self) -> pd.DataFrame:
        """Read OpenCellID data for the specified country"""
        file_path = str(self.base_path / f"opencellid_{self.country.lower()}.csv.gz")

        if not self.data_store.file_exists(file_path):
            raise FileNotFoundError(
                f"OpenCellID data for {self.country} not found at {file_path}. "
                "Download the data first using OpenCellIDDownloader."
            )

        return read_dataset(self.data_store, file_path)

    def to_geodataframe(self) -> gpd.GeoDataFrame:
        """Convert OpenCellID data to a GeoDataFrame"""
        df = self.read_data()
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df.lon, df.lat), crs="EPSG:4326"
        )
        return gdf
read_data()

Read OpenCellID data for the specified country

Source code in gigaspatial/handlers/opencellid.py
def read_data(self) -> pd.DataFrame:
    """Read OpenCellID data for the specified country"""
    file_path = str(self.base_path / f"opencellid_{self.country.lower()}.csv.gz")

    if not self.data_store.file_exists(file_path):
        raise FileNotFoundError(
            f"OpenCellID data for {self.country} not found at {file_path}. "
            "Download the data first using OpenCellIDDownloader."
        )

    return read_dataset(self.data_store, file_path)
to_geodataframe()

Convert OpenCellID data to a GeoDataFrame

Source code in gigaspatial/handlers/opencellid.py
def to_geodataframe(self) -> gpd.GeoDataFrame:
    """Convert OpenCellID data to a GeoDataFrame"""
    df = self.read_data()
    gdf = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df.lon, df.lat), crs="EPSG:4326"
    )
    return gdf
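
A minimal sketch of reading previously downloaded data; the country below is illustrative and is resolved to its ISO3 code via pycountry, so names or ISO codes both work.

from gigaspatial.handlers.opencellid import OpenCellIDReader

reader = OpenCellIDReader(country="Brazil")  # stored internally as "BRA"
df = reader.read_data()           # raises FileNotFoundError if the data was never downloaded
gdf = reader.to_geodataframe()    # EPSG:4326 points built from the lon/lat columns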

osm

OSMLocationFetcher dataclass

A class to fetch and process location data from OpenStreetMap using the Overpass API.

This class supports fetching various OSM location types including amenities, buildings, shops, and other POI categories.

Source code in gigaspatial/handlers/osm.py
@dataclass
class OSMLocationFetcher:
    """
    A class to fetch and process location data from OpenStreetMap using the Overpass API.

    This class supports fetching various OSM location types including amenities, buildings,
    shops, and other POI categories.
    """

    country: str
    location_types: Union[List[str], Dict[str, List[str]]]
    base_url: str = "http://overpass-api.de/api/interpreter"
    timeout: int = 600
    max_retries: int = 3
    retry_delay: int = 5

    def __post_init__(self):
        """Validate inputs, normalize location_types, and set up logging."""
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_2
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")

        # Normalize location_types to always be a dictionary
        if isinstance(self.location_types, list):
            self.location_types = {"amenity": self.location_types}
        elif not isinstance(self.location_types, dict):
            raise TypeError(
                "location_types must be a list of strings or a dictionary mapping categories to type lists"
            )

        self.logger = config.get_logger(self.__class__.__name__)

    def _build_queries(self, since_year: Optional[int] = None) -> List[str]:
        """
        Construct separate Overpass QL queries for different element types and categories.
        Returns list of [nodes_relations_query, ways_query]
        """
        if since_year:
            date_filter = f'(newer:"{since_year}-01-01T00:00:00Z")'
        else:
            date_filter = ""

        # Query for nodes and relations (with center output)
        nodes_relations_queries = []
        for category, types in self.location_types.items():
            nodes_relations_queries.extend(
                [
                    f"""node["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
                    f"""relation["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);""",
                ]
            )

        nodes_relations_queries = "\n".join(nodes_relations_queries)

        nodes_relations_query = f"""
        [out:json][timeout:{self.timeout}];
        area["ISO3166-1"={self.country}]->.searchArea;
        (
            {nodes_relations_queries}
        );
        out center;
        """

        # Query for ways (with geometry output)
        ways_queries = []
        for category, types in self.location_types.items():
            ways_queries.append(
                f"""way["{category}"~"^({"|".join(types)})"]{date_filter}(area.searchArea);"""
            )

        ways_queries = "\n".join(ways_queries)

        ways_query = f"""
        [out:json][timeout:{self.timeout}];
        area["ISO3166-1"={self.country}]->.searchArea;
        (
            {ways_queries}
        );
        out geom;
        """

        return [nodes_relations_query, ways_query]

    def _make_request(self, query: str) -> Dict:
        """Make HTTP request to Overpass API with retry mechanism."""
        for attempt in range(self.max_retries):
            try:
                self.logger.debug(f"Executing query:\n{query}")
                response = requests.get(
                    self.base_url, params={"data": query}, timeout=self.timeout
                )
                response.raise_for_status()
                return response.json()
            except RequestException as e:
                self.logger.warning(f"Attempt {attempt + 1} failed: {str(e)}")
                if attempt < self.max_retries - 1:
                    sleep(self.retry_delay)
                else:
                    raise RuntimeError(
                        f"Failed to fetch data after {self.max_retries} attempts"
                    ) from e

    def _extract_matching_categories(self, tags: Dict[str, str]) -> Dict[str, str]:
        """
        Extract all matching categories and their values from the tags.
        Returns:
            Dict mapping each matching category to its value
        """
        matches = {}
        for category, types in self.location_types.items():
            if category in tags and tags[category] in types:
                matches[category] = tags[category]
        return matches

    def _process_node_relation(self, element: Dict) -> List[Dict[str, any]]:
        """
        Process a node or relation element.
        May return multiple processed elements if the element matches multiple categories.
        """
        try:
            tags = element.get("tags", {})
            matching_categories = self._extract_matching_categories(tags)

            if not matching_categories:
                self.logger.warning(
                    f"Element {element['id']} missing or not matching specified category tags"
                )
                return []

            _lat = element.get("lat") or element["center"]["lat"]
            _lon = element.get("lon") or element["center"]["lon"]
            point_geom = Point(_lon, _lat)

            # for each matching category, create a separate element
            results = []
            for category, value in matching_categories.items():
                results.append(
                    {
                        "source_id": element["id"],
                        "category": category,
                        "category_value": value,
                        "name": tags.get("name", ""),
                        "name_en": tags.get("name:en", ""),
                        "type": element["type"],
                        "geometry": point_geom,
                        "latitude": _lat,
                        "longitude": _lon,
                        "matching_categories": list(matching_categories.keys()),
                    }
                )

            return results

        except KeyError as e:
            self.logger.error(f"Corrupt data received for node/relation: {str(e)}")
            return []

    def _process_way(self, element: Dict) -> List[Dict[str, any]]:
        """
        Process a way element with geometry.
        May return multiple processed elements if the element matches multiple categories.
        """
        try:
            tags = element.get("tags", {})
            matching_categories = self._extract_matching_categories(tags)

            if not matching_categories:
                self.logger.warning(
                    f"Element {element['id']} missing or not matching specified category tags"
                )
                return []

            # Create polygon from geometry points
            polygon = Polygon([(p["lon"], p["lat"]) for p in element["geometry"]])
            centroid = polygon.centroid

            # For each matching category, create a separate element
            results = []
            for category, value in matching_categories.items():
                results.append(
                    {
                        "source_id": element["id"],
                        "category": category,
                        "category_value": value,
                        "name": tags.get("name", ""),
                        "name_en": tags.get("name:en", ""),
                        "type": element["type"],
                        "geometry": polygon,
                        "latitude": centroid.y,
                        "longitude": centroid.x,
                        "matching_categories": list(matching_categories.keys()),
                    }
                )

            return results
        except (KeyError, ValueError) as e:
            self.logger.error(f"Error processing way geometry: {str(e)}")
            return []

    def fetch_locations(
        self,
        since_year: Optional[int] = None,
        handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
    ) -> pd.DataFrame:
        """
        Fetch and process OSM locations.

        Args:
            since_year (int, optional): Filter for locations added/modified since this year.
            handle_duplicates (str): How to handle objects matching multiple categories:
                - 'separate': Create separate entries for each category (default)
                - 'combine': Use a single entry with a list of matching categories
                - 'primary': Keep only the first matching category

        Returns:
            pd.DataFrame: Processed OSM locations
        """
        if handle_duplicates not in ("separate", "combine", "primary"):
            raise ValueError(
                "handle_duplicates must be one of: 'separate', 'combine', 'primary'"
            )

        self.logger.info(
            f"Fetching OSM locations from Overpass API for country: {self.country}"
        )
        self.logger.info(f"Location types: {self.location_types}")
        self.logger.info(f"Handling duplicate category matches as: {handle_duplicates}")

        # Get queries for different element types
        nodes_relations_query, ways_query = self._build_queries(since_year)

        # Fetch nodes and relations
        nodes_relations_response = self._make_request(nodes_relations_query)
        nodes_relations = nodes_relations_response.get("elements", [])

        # Fetch ways
        ways_response = self._make_request(ways_query)
        ways = ways_response.get("elements", [])

        if not nodes_relations and not ways:
            self.logger.warning("No locations found for the specified criteria")
            return pd.DataFrame()

        self.logger.info(
            f"Processing {len(nodes_relations)} nodes/relations and {len(ways)} ways..."
        )

        # Process nodes and relations
        with ThreadPoolExecutor() as executor:
            processed_nodes_relations = [
                item
                for sublist in executor.map(
                    self._process_node_relation, nodes_relations
                )
                for item in sublist
            ]

        # Process ways
        with ThreadPoolExecutor() as executor:
            processed_ways = [
                item
                for sublist in executor.map(self._process_way, ways)
                for item in sublist
            ]

        # Combine all processed elements
        all_elements = processed_nodes_relations + processed_ways

        if not all_elements:
            self.logger.warning("No matching elements found after processing")
            return pd.DataFrame()

        # Handle duplicates based on the specified strategy
        if handle_duplicates != "separate":
            # Group by source_id
            grouped_elements = {}
            for elem in all_elements:
                source_id = elem["source_id"]
                if source_id not in grouped_elements:
                    grouped_elements[source_id] = elem
                elif handle_duplicates == "combine":
                    # Combine matching categories
                    if grouped_elements[source_id]["category"] != elem["category"]:
                        if isinstance(grouped_elements[source_id]["category"], str):
                            grouped_elements[source_id]["category"] = [
                                grouped_elements[source_id]["category"]
                            ]
                            grouped_elements[source_id]["category_value"] = [
                                grouped_elements[source_id]["category_value"]
                            ]

                        if (
                            elem["category"]
                            not in grouped_elements[source_id]["category"]
                        ):
                            grouped_elements[source_id]["category"].append(
                                elem["category"]
                            )
                            grouped_elements[source_id]["category_value"].append(
                                elem["category_value"]
                            )
                # For 'primary', just keep the first one we encountered

            all_elements = list(grouped_elements.values())

        locations = pd.DataFrame(all_elements)

        # Log element type distribution
        type_counts = locations["type"].value_counts()
        self.logger.info("\nElement type distribution:")
        for element_type, count in type_counts.items():
            self.logger.info(f"{element_type}: {count}")

        # Log category distribution
        if handle_duplicates == "combine":
            # Count each category separately when they're in lists
            category_counts = {}
            for cats in locations["category"]:
                if isinstance(cats, list):
                    for cat in cats:
                        category_counts[cat] = category_counts.get(cat, 0) + 1
                else:
                    category_counts[cats] = category_counts.get(cats, 0) + 1

            self.logger.info("\nCategory distribution:")
            for category, count in category_counts.items():
                self.logger.info(f"{category}: {count}")
        else:
            category_counts = locations["category"].value_counts()
            self.logger.info("\nCategory distribution:")
            for category, count in category_counts.items():
                self.logger.info(f"{category}: {count}")

        # Log elements with multiple matching categories
        multi_category = [e for e in all_elements if len(e["matching_categories"]) > 1]
        if multi_category:
            self.logger.info(
                f"\n{len(multi_category)} elements matched multiple categories"
            )

        self.logger.info(f"Successfully processed {len(locations)} locations")
        return locations
__post_init__()

Validate inputs, normalize location_types, and set up logging.

Source code in gigaspatial/handlers/osm.py
def __post_init__(self):
    """Validate inputs, normalize location_types, and set up logging."""
    try:
        self.country = pycountry.countries.lookup(self.country).alpha_2
    except LookupError:
        raise ValueError(f"Invalid country code provided: {self.country}")

    # Normalize location_types to always be a dictionary
    if isinstance(self.location_types, list):
        self.location_types = {"amenity": self.location_types}
    elif not isinstance(self.location_types, dict):
        raise TypeError(
            "location_types must be a list of strings or a dictionary mapping categories to type lists"
        )

    self.logger = config.get_logger(self.__class__.__name__)
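
A quick illustration of the normalization above, with illustrative OSM tag values: a plain list is treated as a set of amenity values, and the country is normalized to its ISO 3166-1 alpha-2 code.

from gigaspatial.handlers.osm import OSMLocationFetcher

# A list is wrapped as {"amenity": [...]}; a dict can target other OSM keys explicitly.
fetcher = OSMLocationFetcher(country="Kenya", location_types=["school", "hospital"])
assert fetcher.location_types == {"amenity": ["school", "hospital"]}
assert fetcher.country == "KE"
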
fetch_locations(since_year=None, handle_duplicates='separate')

Fetch and process OSM locations.

Parameters:

    since_year (int, optional): Filter for locations added/modified since this year. Default: None.
    handle_duplicates (str, optional): How to handle objects matching multiple categories:
        - 'separate': Create separate entries for each category (default)
        - 'combine': Use a single entry with a list of matching categories
        - 'primary': Keep only the first matching category

Returns:

    pd.DataFrame: Processed OSM locations

Source code in gigaspatial/handlers/osm.py
def fetch_locations(
    self,
    since_year: Optional[int] = None,
    handle_duplicates: Literal["separate", "combine", "primary"] = "separate",
) -> pd.DataFrame:
    """
    Fetch and process OSM locations.

    Args:
        since_year (int, optional): Filter for locations added/modified since this year.
        handle_duplicates (str): How to handle objects matching multiple categories:
            - 'separate': Create separate entries for each category (default)
            - 'combine': Use a single entry with a list of matching categories
            - 'primary': Keep only the first matching category

    Returns:
        pd.DataFrame: Processed OSM locations
    """
    if handle_duplicates not in ("separate", "combine", "primary"):
        raise ValueError(
            "handle_duplicates must be one of: 'separate', 'combine', 'primary'"
        )

    self.logger.info(
        f"Fetching OSM locations from Overpass API for country: {self.country}"
    )
    self.logger.info(f"Location types: {self.location_types}")
    self.logger.info(f"Handling duplicate category matches as: {handle_duplicates}")

    # Get queries for different element types
    nodes_relations_query, ways_query = self._build_queries(since_year)

    # Fetch nodes and relations
    nodes_relations_response = self._make_request(nodes_relations_query)
    nodes_relations = nodes_relations_response.get("elements", [])

    # Fetch ways
    ways_response = self._make_request(ways_query)
    ways = ways_response.get("elements", [])

    if not nodes_relations and not ways:
        self.logger.warning("No locations found for the specified criteria")
        return pd.DataFrame()

    self.logger.info(
        f"Processing {len(nodes_relations)} nodes/relations and {len(ways)} ways..."
    )

    # Process nodes and relations
    with ThreadPoolExecutor() as executor:
        processed_nodes_relations = [
            item
            for sublist in executor.map(
                self._process_node_relation, nodes_relations
            )
            for item in sublist
        ]

    # Process ways
    with ThreadPoolExecutor() as executor:
        processed_ways = [
            item
            for sublist in executor.map(self._process_way, ways)
            for item in sublist
        ]

    # Combine all processed elements
    all_elements = processed_nodes_relations + processed_ways

    if not all_elements:
        self.logger.warning("No matching elements found after processing")
        return pd.DataFrame()

    # Handle duplicates based on the specified strategy
    if handle_duplicates != "separate":
        # Group by source_id
        grouped_elements = {}
        for elem in all_elements:
            source_id = elem["source_id"]
            if source_id not in grouped_elements:
                grouped_elements[source_id] = elem
            elif handle_duplicates == "combine":
                # Combine matching categories
                if grouped_elements[source_id]["category"] != elem["category"]:
                    if isinstance(grouped_elements[source_id]["category"], str):
                        grouped_elements[source_id]["category"] = [
                            grouped_elements[source_id]["category"]
                        ]
                        grouped_elements[source_id]["category_value"] = [
                            grouped_elements[source_id]["category_value"]
                        ]

                    if (
                        elem["category"]
                        not in grouped_elements[source_id]["category"]
                    ):
                        grouped_elements[source_id]["category"].append(
                            elem["category"]
                        )
                        grouped_elements[source_id]["category_value"].append(
                            elem["category_value"]
                        )
            # For 'primary', just keep the first one we encountered

        all_elements = list(grouped_elements.values())

    locations = pd.DataFrame(all_elements)

    # Log element type distribution
    type_counts = locations["type"].value_counts()
    self.logger.info("\nElement type distribution:")
    for element_type, count in type_counts.items():
        self.logger.info(f"{element_type}: {count}")

    # Log category distribution
    if handle_duplicates == "combine":
        # Count each category separately when they're in lists
        category_counts = {}
        for cats in locations["category"]:
            if isinstance(cats, list):
                for cat in cats:
                    category_counts[cat] = category_counts.get(cat, 0) + 1
            else:
                category_counts[cats] = category_counts.get(cats, 0) + 1

        self.logger.info("\nCategory distribution:")
        for category, count in category_counts.items():
            self.logger.info(f"{category}: {count}")
    else:
        category_counts = locations["category"].value_counts()
        self.logger.info("\nCategory distribution:")
        for category, count in category_counts.items():
            self.logger.info(f"{category}: {count}")

    # Log elements with multiple matching categories
    multi_category = [e for e in all_elements if len(e["matching_categories"]) > 1]
    if multi_category:
        self.logger.info(
            f"\n{len(multi_category)} elements matched multiple categories"
        )

    self.logger.info(f"Successfully processed {len(locations)} locations")
    return locations
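
A minimal usage sketch of fetch_locations, with illustrative tag values; Overpass queries for large countries can take several minutes and are retried up to max_retries times.

from gigaspatial.handlers.osm import OSMLocationFetcher

fetcher = OSMLocationFetcher(
    country="Rwanda",
    location_types={"amenity": ["school", "hospital"], "building": ["school"]},
)

# One row per (element, matching category) pair, limited to elements
# added or modified since 2020.
locations = fetcher.fetch_locations(since_year=2020, handle_duplicates="separate")

# Alternatively, collapse multi-category matches into a single row whose
# category/category_value columns become lists (this re-queries Overpass).
combined = fetcher.fetch_locations(handle_duplicates="combine")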

overture

OvertureAmenityFetcher

A class to fetch and process amenity locations from Overture.

Source code in gigaspatial/handlers/overture.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class OvertureAmenityFetcher:
    """
    A class to fetch and process amenity locations from Overture.
    """

    # constants
    release: Optional[str] = "2024-12-18.0"
    base_url: Optional[str] = (
        "s3://overturemaps-us-west-2/release/{release}/theme=places/*/*"
    )

    # user config
    country: str = Field(...)
    amenity_types: List[str] = Field(..., description="List of amenity types to fetch")
    geom: Union[Polygon, MultiPolygon] = None

    # config for country boundary access from data storage
    # if None GADM boundaries will be used
    data_store: DataStore = None
    country_geom_path: Optional[Union[str, Path]] = None

    def __post_init__(self):
        """Validate inputs and set up logging."""
        try:
            self.country = pycountry.countries.lookup(self.country).alpha_2
        except LookupError:
            raise ValueError(f"Invalid country code provided: {self.country}")

        self.base_url = self.base_url.format(release=self.release)
        self.logger = config.get_logger(self.__class__.__name__)

        self.connection = self._set_connection()

    def _set_connection(self):
        """Set the connection to the DB"""
        db = duckdb.connect()
        db.install_extension("spatial")
        db.load_extension("spatial")
        return db

    def _load_country_geometry(
        self,
    ) -> Union[Polygon, MultiPolygon]:
        """Load country boundary geometry from DataStore or GADM."""

        gdf_admin0 = AdminBoundaries.create(
            country_code=pycountry.countries.lookup(self.country).alpha_3,
            admin_level=0,
            data_store=self.data_store,
            path=self.country_geom_path,
        ).to_geodataframe()

        return gdf_admin0.geometry.iloc[0]

    def _build_query(self, match_pattern: bool = False, **kwargs) -> str:
        """Constructs and returns the query"""

        if match_pattern:
            amenity_query = " OR ".join(
                [f"category ilike '%{amenity}%'" for amenity in self.amenity_types]
            )
        else:
            amenity_query = " OR ".join(
                [f"category == '{amenity}'" for amenity in self.amenity_types]
            )

        query = """
        SELECT id,
            names.primary AS name,
            ROUND(confidence,2) as confidence,
            categories.primary AS category,
            ST_AsText(geometry) as geometry,
        FROM read_parquet('s3://overturemaps-us-west-2/release/2024-12-18.0/theme=places/type=place/*',
            hive_partitioning=1)
        WHERE bbox.xmin > {}
            AND bbox.ymin > {} 
            AND bbox.xmax <  {}
            AND bbox.ymax < {}
            AND ({})
        """

        if not self.geom:
            self.geom = self._load_country_geometry()

        return query.format(*self.geom.bounds, amenity_query)

    def fetch_locations(
        self, match_pattern: bool = False, **kwargs
    ) -> gpd.GeoDataFrame:
        """Fetch and process amenity locations."""
        self.logger.info("Fetching amenity locations from Overture DB...")

        query = self._build_query(match_pattern=match_pattern, **kwargs)

        df = self.connection.execute(query).df()

        self.logger.info("Processing geometries")
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.GeoSeries.from_wkt(df["geometry"]), crs="EPSG:4326"
        )

        # filter by geometry boundary
        s = STRtree(gdf.geometry)
        result = s.query(self.geom, predicate="intersects")

        locations = gdf.iloc[result].reset_index(drop=True)

        self.logger.info(f"Successfully processed {len(locations)} amenity locations")
        return locations
__post_init__()

Validate inputs and set up logging.

Source code in gigaspatial/handlers/overture.py
def __post_init__(self):
    """Validate inputs and set up logging."""
    try:
        self.country = pycountry.countries.lookup(self.country).alpha_2
    except LookupError:
        raise ValueError(f"Invalid country code provided: {self.country}")

    self.base_url = self.base_url.format(release=self.release)
    self.logger = config.get_logger(self.__class__.__name__)

    self.connection = self._set_connection()
fetch_locations(match_pattern=False, **kwargs)

Fetch and process amenity locations.

Source code in gigaspatial/handlers/overture.py
def fetch_locations(
    self, match_pattern: bool = False, **kwargs
) -> gpd.GeoDataFrame:
    """Fetch and process amenity locations."""
    self.logger.info("Fetching amenity locations from Overture DB...")

    query = self._build_query(match_pattern=match_pattern, **kwargs)

    df = self.connection.execute(query).df()

    self.logger.info("Processing geometries")
    gdf = gpd.GeoDataFrame(
        df, geometry=gpd.GeoSeries.from_wkt(df["geometry"]), crs="EPSG:4326"
    )

    # filter by geometry boundary
    s = STRtree(gdf.geometry)
    result = s.query(self.geom, predicate="intersects")

    locations = gdf.iloc[result].reset_index(drop=True)

    self.logger.info(f"Successfully processed {len(locations)} amenity locations")
    return locations
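
A minimal usage sketch; the amenity types are illustrative, and the query reads Overture's places theme from S3 via DuckDB, so network access is required. When no geometry is supplied, the country boundary is loaded from GADM or from the configured data store.

from gigaspatial.handlers.overture import OvertureAmenityFetcher

fetcher = OvertureAmenityFetcher(
    country="SEN",
    amenity_types=["school", "hospital"],
)

# Exact category matches
gdf = fetcher.fetch_locations()

# Substring matches via ILIKE, e.g. "school" also matches "driving_school"
gdf_fuzzy = fetcher.fetch_locations(match_pattern=True)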

rwi

RWIConfig dataclass

Bases: HDXConfig

Configuration for Relative Wealth Index data access

Source code in gigaspatial/handlers/rwi.py
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class RWIConfig(HDXConfig):
    """Configuration for Relative Wealth Index data access"""

    # Override dataset_name to be fixed for RWI
    dataset_name: Literal["relative-wealth-index"] = Field(
        default="relative-wealth-index"
    )

    # Additional RWI-specific configurations
    country: Optional[str] = Field(
        default=None, description="Country ISO code to filter data for"
    )
    latest_only: bool = Field(
        default=True,
        description="If True, only get the latest resource for each country",
    )

    def __post_init__(self):
        super().__post_init__()

    def get_relevant_data_units_by_country(
        self, country: str, **kwargs
    ) -> List[Resource]:
        """Get relevant data units for a country, optionally filtering for latest version"""
        country = pycountry.countries.lookup(country)
        values = [country.alpha_3]
        resources = self.get_dataset_resources(
            filter={"url": values},
        )

        if self.latest_only and len(resources) > 1:
            # Find the resource with the latest creation date
            latest_resource = None
            latest_date = None

            for resource in resources:
                created = resource.get("created")
                if created:
                    try:
                        created_dt = datetime.fromisoformat(
                            created.replace("Z", "+00:00")
                        )
                        if latest_date is None or created_dt > latest_date:
                            latest_date = created_dt
                            latest_resource = resource
                    except ValueError:
                        self.logger.warning(
                            f"Could not parse creation date for resource: {created}"
                        )

            if latest_resource:
                resources = [latest_resource]

        return resources
get_relevant_data_units_by_country(country, **kwargs)

Get relevant data units for a country, optionally filtering for latest version

Source code in gigaspatial/handlers/rwi.py
def get_relevant_data_units_by_country(
    self, country: str, **kwargs
) -> List[Resource]:
    """Get relevant data units for a country, optionally filtering for latest version"""
    country = pycountry.countries.lookup(country)
    values = [country.alpha_3]
    resources = self.get_dataset_resources(
        filter={"url": values},
    )

    if self.latest_only and len(resources) > 1:
        # Find the resource with the latest creation date
        latest_resource = None
        latest_date = None

        for resource in resources:
            created = resource.get("created")
            if created:
                try:
                    created_dt = datetime.fromisoformat(
                        created.replace("Z", "+00:00")
                    )
                    if latest_date is None or created_dt > latest_date:
                        latest_date = created_dt
                        latest_resource = resource
                except ValueError:
                    self.logger.warning(
                        f"Could not parse creation date for resource: {created}"
                    )

        if latest_resource:
            resources = [latest_resource]

    return resources
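
The latest_only filter above amounts to parsing each resource's "created" timestamp and keeping the newest one. The same pattern in isolation, using hypothetical resource dictionaries rather than real HDX Resource objects:

from datetime import datetime

resources = [
    {"name": "rwi_v1.csv", "created": "2021-06-01T10:00:00Z"},
    {"name": "rwi_v2.csv", "created": "2023-03-15T08:30:00Z"},
]

def parse_created(resource):
    # fromisoformat() on older Python versions rejects a trailing "Z",
    # so it is replaced with an explicit UTC offset, as in the code above.
    return datetime.fromisoformat(resource["created"].replace("Z", "+00:00"))

latest = max(resources, key=parse_created)
print(latest["name"])  # rwi_v2.csv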

RWIDownloader

Bases: HDXDownloader

Specialized downloader for the Relative Wealth Index dataset from HDX

Source code in gigaspatial/handlers/rwi.py
class RWIDownloader(HDXDownloader):
    """Specialized downloader for the Relative Wealth Index dataset from HDX"""

    def __init__(
        self,
        config: Union[RWIConfig, dict] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        config = config if isinstance(config, RWIConfig) else RWIConfig(**config)
        super().__init__(config=config, data_store=data_store, logger=logger)

RWIHandler

Bases: HDXHandler

Handler for Relative Wealth Index dataset

Source code in gigaspatial/handlers/rwi.py
class RWIHandler(HDXHandler):
    """Handler for Relative Wealth Index dataset"""

    def __init__(
        self,
        config: Optional[RWIConfig] = None,
        downloader: Optional[RWIDownloader] = None,
        reader: Optional[RWIReader] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
        **kwargs,
    ):
        super().__init__(
            dataset_name="relative-wealth-index",
            config=config,
            downloader=downloader,
            reader=reader,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_config(
        self, data_store: DataStore, logger: logging.Logger, **kwargs
    ) -> RWIConfig:
        """Create and return a RWIConfig instance"""
        return RWIConfig(
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_downloader(
        self,
        config: RWIConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> RWIDownloader:
        """Create and return a RWIDownloader instance"""
        return RWIDownloader(
            config=config,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )

    def create_reader(
        self,
        config: RWIConfig,
        data_store: DataStore,
        logger: logging.Logger,
        **kwargs,
    ) -> RWIReader:
        """Create and return a RWIReader instance"""
        return RWIReader(
            config=config,
            data_store=data_store,
            logger=logger,
            **kwargs,
        )
create_config(data_store, logger, **kwargs)

Create and return a RWIConfig instance

Source code in gigaspatial/handlers/rwi.py
def create_config(
    self, data_store: DataStore, logger: logging.Logger, **kwargs
) -> RWIConfig:
    """Create and return a RWIConfig instance"""
    return RWIConfig(
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
create_downloader(config, data_store, logger, **kwargs)

Create and return a RWIDownloader instance

Source code in gigaspatial/handlers/rwi.py
def create_downloader(
    self,
    config: RWIConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> RWIDownloader:
    """Create and return a RWIDownloader instance"""
    return RWIDownloader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )
create_reader(config, data_store, logger, **kwargs)

Create and return a RWIReader instance

Source code in gigaspatial/handlers/rwi.py
def create_reader(
    self,
    config: RWIConfig,
    data_store: DataStore,
    logger: logging.Logger,
    **kwargs,
) -> RWIReader:
    """Create and return a RWIReader instance"""
    return RWIReader(
        config=config,
        data_store=data_store,
        logger=logger,
        **kwargs,
    )

RWIReader

Bases: HDXReader

Specialized reader for the Relative Wealth Index dataset from HDX

Source code in gigaspatial/handlers/rwi.py
class RWIReader(HDXReader):
    """Specialized reader for the Relative Wealth Index dataset from HDX"""

    def __init__(
        self,
        config: Union[RWIConfig, dict] = None,
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        config = config if isinstance(config, RWIConfig) else RWIConfig(**config)
        super().__init__(config=config, data_store=data_store, logger=logger)
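
A minimal sketch of using the handler, assuming the HDX defaults baked into RWIConfig are sufficient for your environment; the country code is illustrative.

from gigaspatial.handlers.rwi import RWIHandler

handler = RWIHandler()  # builds RWIConfig, RWIDownloader and RWIReader with defaults

# List HDX resources whose URL references the country's ISO3 code; with
# latest_only=True (the default) only the most recently created one is kept.
resources = handler.config.get_relevant_data_units_by_country("KEN")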

unicef_georepo

GeoRepoClient

A client for interacting with the GeoRepo API.

GeoRepo is a platform for managing and accessing geospatial administrative boundary data. This client provides methods to search, retrieve, and work with modules, datasets, views, and administrative entities.

Attributes:

    base_url (str): The base URL for the GeoRepo API
    api_key (str): The API key for authentication
    email (str): The email address associated with the API key
    headers (dict): HTTP headers used for API requests
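
A minimal sketch of browsing the hierarchy exposed by this client; credentials are read from configuration when not passed explicitly, and the UUIDs below are placeholders.

from gigaspatial.handlers.unicef_georepo import GeoRepoClient

client = GeoRepoClient()  # uses GEOREPO_API_KEY / GEOREPO_USER_EMAIL from config

if client.check_connection():
    modules = client.list_modules()
    # Drill down: module -> datasets -> views -> entities at an admin level
    datasets = client.list_datasets_by_module("<module-uuid>")
    views = client.list_views_by_dataset("<dataset-uuid>")
    entities, meta = client.list_entities_by_admin_level(
        "<view-uuid>", admin_level=0, geom="centroid", format="geojson"
    )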

Source code in gigaspatial/handlers/unicef_georepo.py
class GeoRepoClient:
    """
    A client for interacting with the GeoRepo API.

    GeoRepo is a platform for managing and accessing geospatial administrative
    boundary data. This client provides methods to search, retrieve, and work
    with modules, datasets, views, and administrative entities.

    Attributes:
        base_url (str): The base URL for the GeoRepo API
        api_key (str): The API key for authentication
        email (str): The email address associated with the API key
        headers (dict): HTTP headers used for API requests
    """

    def __init__(self, api_key=None, email=None):
        """
        Initialize the GeoRepo client.

        Args:
            api_key (str, optional): GeoRepo API key. If not provided, will use
                the GEOREPO_API_KEY environment variable from config.
            email (str, optional): Email address associated with the API key.
                If not provided, will use the GEOREPO_USER_EMAIL environment
                variable from config.

        Raises:
            ValueError: If api_key or email is not provided and cannot be found
                in environment variables.
        """
        self.base_url = "https://georepo.unicef.org/api/v1"
        self.api_key = api_key or config.GEOREPO_API_KEY
        self.email = email or config.GEOREPO_USER_EMAIL
        self.logger = config.get_logger(self.__class__.__name__)

        if not self.api_key:
            raise ValueError(
                "API Key is required. Provide it as a parameter or set GEOREPO_API_KEY environment variable."
            )

        if not self.email:
            raise ValueError(
                "Email is required. Provide it as a parameter or set GEOREPO_USER_EMAIL environment variable."
            )

        self.headers = {
            "Accept": "application/json",
            "Authorization": f"Token {self.api_key}",
            "GeoRepo-User-Key": self.email,
        }

    def _make_request(self, method, endpoint, params=None, data=None):
        """Internal method to handle making HTTP requests."""
        try:
            response = requests.request(
                method, endpoint, headers=self.headers, params=params, json=data
            )
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            raise requests.exceptions.HTTPError(f"API request failed: {e}")

    def check_connection(self):
        """
        Checks if the API connection is valid by making a simple request.

        Returns:
            bool: True if the connection is valid, False otherwise.
        """
        endpoint = f"{self.base_url}/search/module/list/"
        try:
            self._make_request("GET", endpoint)
            return True
        except requests.exceptions.HTTPError as e:
            return False
        except requests.exceptions.RequestException as e:
            raise requests.exceptions.RequestException(
                f"Connection check encountered a network error: {e}"
            )

    def list_modules(self):
        """
        List all available modules in GeoRepo.

        A module is a top-level organizational unit that contains datasets.
        Examples include "Admin Boundaries", "Health Facilities", etc.

        Returns:
            dict: JSON response containing a list of modules with their metadata.
                Each module includes 'uuid', 'name', 'description', and other properties.

        Raises:
            requests.HTTPError: If the API request fails.
        """
        endpoint = f"{self.base_url}/search/module/list/"
        response = self._make_request("GET", endpoint)
        return response.json()

    def list_datasets_by_module(self, module_uuid):
        """
        List all datasets within a specific module.

        A dataset represents a collection of related geographic entities,
        such as administrative boundaries for a specific country or region.

        Args:
            module_uuid (str): The UUID of the module to query.

        Returns:
            dict: JSON response containing a list of datasets with their metadata.
                Each dataset includes 'uuid', 'name', 'description', creation date, etc.

        Raises:
            requests.HTTPError: If the API request fails or module_uuid is invalid.
        """
        endpoint = f"{self.base_url}/search/module/{module_uuid}/dataset/list/"
        response = self._make_request("GET", endpoint)
        return response.json()

    def get_dataset_details(self, dataset_uuid):
        """
        Get detailed information about a specific dataset.

        This includes metadata about the dataset and information about
        available administrative levels (e.g., country, province, district).

        Args:
            dataset_uuid (str): The UUID of the dataset to query.

        Returns:
            dict: JSON response containing dataset details including:
                - Basic metadata (name, description, etc.)
                - Available administrative levels and their properties
                - Temporal information and data sources

        Raises:
            requests.HTTPError: If the API request fails or dataset_uuid is invalid.
        """
        endpoint = f"{self.base_url}/search/dataset/{dataset_uuid}/"
        response = self._make_request("GET", endpoint)
        return response.json()

    def list_views_by_dataset(self, dataset_uuid, page=1, page_size=50):
        """
        List views for a dataset with pagination support.

        A view represents a specific version or subset of a dataset.
        Views may be tagged as 'latest' or represent different time periods.

        Args:
            dataset_uuid (str): The UUID of the dataset to query.
            page (int, optional): Page number for pagination. Defaults to 1.
            page_size (int, optional): Number of results per page. Defaults to 50.

        Returns:
            dict: JSON response containing paginated list of views with metadata.
                Includes 'results', 'total_page', 'current_page', and 'count' fields.
                Each view includes 'uuid', 'name', 'tags', and other properties.

        Raises:
            requests.HTTPError: If the API request fails or dataset_uuid is invalid.
        """
        endpoint = f"{self.base_url}/search/dataset/{dataset_uuid}/view/list/"
        params = {"page": page, "page_size": page_size}
        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def list_entities_by_admin_level(
        self,
        view_uuid,
        admin_level,
        geom="no_geom",
        format="json",
        page=1,
        page_size=50,
    ):
        """
        List entities at a specific administrative level within a view.

        Administrative levels typically follow a hierarchy:
        - Level 0: Countries
        - Level 1: States/Provinces/Regions
        - Level 2: Districts/Counties
        - Level 3: Sub-districts/Municipalities
        - And so on...

        Args:
            view_uuid (str): The UUID of the view to query.
            admin_level (int): The administrative level to retrieve (0, 1, 2, etc.).
            geom (str, optional): Geometry inclusion level. Options:
                - "no_geom": No geometry data
                - "centroid": Only centroid points
                - "full_geom": Complete boundary geometries
                Defaults to "no_geom".
            format (str, optional): Response format ("json" or "geojson").
                Defaults to "json".
            page (int, optional): Page number for pagination. Defaults to 1.
            page_size (int, optional): Number of results per page. Defaults to 50.

        Returns:
            tuple: A tuple containing:
                - dict: JSON/GeoJSON response with entity data
                - dict: Metadata with pagination info (page, total_page, total_count)

        Raises:
            requests.HTTPError: If the API request fails or parameters are invalid.
        """
        endpoint = (
            f"{self.base_url}/search/view/{view_uuid}/entity/level/{admin_level}/"
        )
        params = {"page": page, "page_size": page_size, "geom": geom, "format": format}
        response = self._make_request("GET", endpoint, params=params)

        metadata = {
            "page": int(response.headers.get("page", 1)),
            "total_page": int(response.headers.get("total_page", 1)),
            "total_count": int(response.headers.get("count", 0)),
        }

        return response.json(), metadata

    def get_entity_by_ucode(self, ucode, geom="full_geom", format="geojson"):
        """
        Get detailed information about a specific entity using its Ucode.

        A Ucode (Universal Code) is a unique identifier for geographic entities
        within the GeoRepo system, typically in the format "ISO3_LEVEL_NAME".

        Args:
            ucode (str): The unique code identifier for the entity.
            geom (str, optional): Geometry inclusion level. Options:
                - "no_geom": No geometry data
                - "centroid": Only centroid points
                - "full_geom": Complete boundary geometries
                Defaults to "full_geom".
            format (str, optional): Response format ("json" or "geojson").
                Defaults to "geojson".

        Returns:
            dict: JSON/GeoJSON response containing entity details including
                geometry, properties, administrative level, and metadata.

        Raises:
            requests.HTTPError: If the API request fails or ucode is invalid.
        """
        endpoint = f"{self.base_url}/search/entity/ucode/{ucode}/"
        params = {"geom": geom, "format": format}
        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def list_entity_children(
        self, view_uuid, entity_ucode, geom="no_geom", format="json"
    ):
        """
        List direct children of an entity in the administrative hierarchy.

        For example, if given a country entity, this will return its states/provinces.
        If given a state entity, this will return its districts/counties.

        Args:
            view_uuid (str): The UUID of the view containing the entity.
            entity_ucode (str): The Ucode of the parent entity.
            geom (str, optional): Geometry inclusion level. Options:
                - "no_geom": No geometry data
                - "centroid": Only centroid points
                - "full_geom": Complete boundary geometries
                Defaults to "no_geom".
            format (str, optional): Response format ("json" or "geojson").
                Defaults to "json".

        Returns:
            dict: JSON/GeoJSON response containing list of child entities
                with their properties and optional geometry data.

        Raises:
            requests.HTTPError: If the API request fails or parameters are invalid.
        """
        endpoint = (
            f"{self.base_url}/search/view/{view_uuid}/entity/{entity_ucode}/children/"
        )
        params = {"geom": geom, "format": format}
        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def search_entities_by_name(self, view_uuid, name, page=1, page_size=50):
        """
        Search for entities by name using fuzzy matching.

        This performs a similarity-based search to find entities whose names
        match or are similar to the provided search term.

        Args:
            view_uuid (str): The UUID of the view to search within.
            name (str): The name or partial name to search for.
            page (int, optional): Page number for pagination. Defaults to 1.
            page_size (int, optional): Number of results per page. Defaults to 50.

        Returns:
            dict: JSON response containing paginated search results with
                matching entities and their similarity scores.

        Raises:
            requests.HTTPError: If the API request fails or parameters are invalid.
        """
        endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/{name}/"
        params = {"page": page, "page_size": page_size}
        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def get_admin_boundaries(
        self, view_uuid, admin_level=None, geom="full_geom", format="geojson"
    ):
        """
        Get administrative boundaries for a specific level or all levels.

        This is a convenience method that can retrieve boundaries for a single
        administrative level or attempt to fetch all available levels.

        Args:
            view_uuid (str): The UUID of the view to query.
            admin_level (int, optional): Administrative level to retrieve
                (0=country, 1=region, etc.). If None, attempts to fetch all levels.
            geom (str, optional): Geometry inclusion level. Options:
                - "no_geom": No geometry data
                - "centroid": Only centroid points
                - "full_geom": Complete boundary geometries
                Defaults to "full_geom".
            format (str, optional): Response format ("json" or "geojson").
                Defaults to "geojson".

        Returns:
            dict: JSON/GeoJSON response containing administrative boundaries
                in the specified format. For GeoJSON, returns a FeatureCollection.

        Raises:
            requests.HTTPError: If the API request fails or parameters are invalid.
        """
        # Construct the endpoint based on whether admin_level is provided
        if admin_level is not None:
            endpoint = (
                f"{self.base_url}/search/view/{view_uuid}/entity/level/{admin_level}/"
            )
        else:
            # For all levels, we need to fetch level 0 and then get children for each entity
            endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/list/"

        params = {
            "geom": geom,
            "format": format,
            "page_size": 100,
        }

        response = self._make_request("GET", endpoint, params=params)
        return response.json()

    def get_vector_tiles_url(self, view_info):
        """
        Generate an authenticated URL for accessing vector tiles.

        Vector tiles are used for efficient map rendering and can be consumed
        by mapping libraries like Mapbox GL JS or OpenLayers.

        Args:
            view_info (dict): Dictionary containing view information that must
                include a 'vector_tiles' key with the base vector tiles URL.

        Returns:
            str: Fully authenticated vector tiles URL with API key and user email
                parameters appended for access control.

        Raises:
            ValueError: If 'vector_tiles' key is not found in view_info.
        """
        if "vector_tiles" not in view_info:
            raise ValueError("Vector tiles URL not found in view information")

        vector_tiles_url = view_info["vector_tiles"]

        # Parse out the timestamp parameter if it exists
        if "?t=" in vector_tiles_url:
            base_url, timestamp = vector_tiles_url.split("?t=")
            return f"{base_url}?t={timestamp}&token={self.api_key}&georepo_user_key={self.email}"
        else:
            return (
                f"{vector_tiles_url}?token={self.api_key}&georepo_user_key={self.email}"
            )

    def find_country_by_iso3(self, view_uuid, iso3_code):
        """
        Find a country entity using its ISO3 country code.

        This method searches through all level-0 (country) entities to find
        one that matches the provided ISO3 code. It checks both the entity's
        Ucode and any external codes stored in the ext_codes field.

        Args:
            view_uuid (str): The UUID of the view to search within.
            iso3_code (str): The ISO3 country code to search for (e.g., 'USA', 'KEN', 'BRA').

        Returns:
            dict or None: Entity information dictionary for the matching country
                if found, including properties like name, ucode, admin_level, etc.
                Returns None if no matching country is found.

        Note:
            This method handles pagination automatically to search through all
            available countries in the dataset, which may involve multiple API calls.

        Raises:
            requests.HTTPError: If the API request fails or view_uuid is invalid.
        """
        # Admin level 0 represents countries
        endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/level/0/"
        params = {
            "page_size": 100,
            "geom": "no_geom",
        }

        # need to paginate since it can be a large dataset
        all_countries = []
        page = 1

        while True:
            params["page"] = page
            response = self._make_request("GET", endpoint, params=params)
            data = response.json()

            countries = data.get("results", [])
            all_countries.extend(countries)

            # check if there are more pages
            if page >= data.get("total_page", 1):
                break

            page += 1

        # Search by ISO3 code
        for country in all_countries:
            # Check if ISO3 code is in the ucode (typically at the beginning)
            if country["ucode"].startswith(iso3_code + "_"):
                return country

            # Also check in ext_codes which may contain the ISO3 code
            ext_codes = country.get("ext_codes", {})
            if ext_codes:
                # Check if ISO3 is directly in ext_codes
                if (
                    ext_codes.get("PCode", "") == iso3_code
                    or ext_codes.get("default", "") == iso3_code
                ):
                    return country

        return None
__init__(api_key=None, email=None)

Initialize the GeoRepo client.

Parameters:

Name Type Description Default
api_key str

GeoRepo API key. If not provided, will use the GEOREPO_API_KEY environment variable from config.

None
email str

Email address associated with the API key. If not provided, will use the GEOREPO_USER_EMAIL environment variable from config.

None

Raises:

Type Description
ValueError

If api_key or email is not provided and cannot be found in environment variables.

Source code in gigaspatial/handlers/unicef_georepo.py
def __init__(self, api_key=None, email=None):
    """
    Initialize the GeoRepo client.

    Args:
        api_key (str, optional): GeoRepo API key. If not provided, will use
            the GEOREPO_API_KEY environment variable from config.
        email (str, optional): Email address associated with the API key.
            If not provided, will use the GEOREPO_USER_EMAIL environment
            variable from config.

    Raises:
        ValueError: If api_key or email is not provided and cannot be found
            in environment variables.
    """
    self.base_url = "https://georepo.unicef.org/api/v1"
    self.api_key = api_key or config.GEOREPO_API_KEY
    self.email = email or config.GEOREPO_USER_EMAIL
    self.logger = config.get_logger(self.__class__.__name__)

    if not self.api_key:
        raise ValueError(
            "API Key is required. Provide it as a parameter or set GEOREPO_API_KEY environment variable."
        )

    if not self.email:
        raise ValueError(
            "Email is required. Provide it as a parameter or set GEOREPO_USER_EMAIL environment variable."
        )

    self.headers = {
        "Accept": "application/json",
        "Authorization": f"Token {self.api_key}",
        "GeoRepo-User-Key": self.email,
    }
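
A minimal construction sketch (the import path below is assumed from the source location gigaspatial/handlers/unicef_georepo.py; credentials may also come from the GEOREPO_API_KEY and GEOREPO_USER_EMAIL environment variables):

from gigaspatial.handlers.unicef_georepo import GeoRepoClient

# Explicit credentials (placeholder values); omit both arguments to fall back
# to the configured environment variables.
client = GeoRepoClient(api_key="YOUR_API_KEY", email="you@example.org")
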
check_connection()

Checks if the API connection is valid by making a simple request.

Returns:

Name Type Description
bool

True if the connection is valid, False otherwise.

Source code in gigaspatial/handlers/unicef_georepo.py
def check_connection(self):
    """
    Checks if the API connection is valid by making a simple request.

    Returns:
        bool: True if the connection is valid, False otherwise.
    """
    endpoint = f"{self.base_url}/search/module/list/"
    try:
        self._make_request("GET", endpoint)
        return True
    except requests.exceptions.HTTPError as e:
        return False
    except requests.exceptions.RequestException as e:
        raise requests.exceptions.RequestException(
            f"Connection check encountered a network error: {e}"
        )
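
A quick credential check before doing any real work might look like this, reusing the client constructed above:

# check_connection() returns False on an HTTP error (e.g. a bad token) and
# raises on network-level failures.
if not client.check_connection():
    raise RuntimeError("GeoRepo connection failed; check API key and email")
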
find_country_by_iso3(view_uuid, iso3_code)

Find a country entity using its ISO3 country code.

This method searches through all level-0 (country) entities to find one that matches the provided ISO3 code. It checks both the entity's Ucode and any external codes stored in the ext_codes field.

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view to search within.

required
iso3_code str

The ISO3 country code to search for (e.g., 'USA', 'KEN', 'BRA').

required

Returns:

Type Description

dict or None: Entity information dictionary for the matching country if found, including properties like name, ucode, admin_level, etc. Returns None if no matching country is found.

Note

This method handles pagination automatically to search through all available countries in the dataset, which may involve multiple API calls.

Raises:

Type Description
HTTPError

If the API request fails or view_uuid is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def find_country_by_iso3(self, view_uuid, iso3_code):
    """
    Find a country entity using its ISO3 country code.

    This method searches through all level-0 (country) entities to find
    one that matches the provided ISO3 code. It checks both the entity's
    Ucode and any external codes stored in the ext_codes field.

    Args:
        view_uuid (str): The UUID of the view to search within.
        iso3_code (str): The ISO3 country code to search for (e.g., 'USA', 'KEN', 'BRA').

    Returns:
        dict or None: Entity information dictionary for the matching country
            if found, including properties like name, ucode, admin_level, etc.
            Returns None if no matching country is found.

    Note:
        This method handles pagination automatically to search through all
        available countries in the dataset, which may involve multiple API calls.

    Raises:
        requests.HTTPError: If the API request fails or view_uuid is invalid.
    """
    # Admin level 0 represents countries
    endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/level/0/"
    params = {
        "page_size": 100,
        "geom": "no_geom",
    }

    # need to paginate since it can be a large dataset
    all_countries = []
    page = 1

    while True:
        params["page"] = page
        response = self._make_request("GET", endpoint, params=params)
        data = response.json()

        countries = data.get("results", [])
        all_countries.extend(countries)

        # check if there are more pages
        if page >= data.get("total_page", 1):
            break

        page += 1

    # Search by ISO3 code
    for country in all_countries:
        # Check if ISO3 code is in the ucode (typically at the beginning)
        if country["ucode"].startswith(iso3_code + "_"):
            return country

        # Also check in ext_codes which may contain the ISO3 code
        ext_codes = country.get("ext_codes", {})
        if ext_codes:
            # Check if ISO3 is directly in ext_codes
            if (
                ext_codes.get("PCode", "") == iso3_code
                or ext_codes.get("default", "") == iso3_code
            ):
                return country

    return None
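
A usage sketch, assuming view_uuid refers to an existing view:

country = client.find_country_by_iso3(view_uuid, "KEN")
if country is not None:
    # The returned dict includes at least 'name' and 'ucode'.
    print(country["name"], country["ucode"])
else:
    print("No country entity matched the ISO3 code")
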
get_admin_boundaries(view_uuid, admin_level=None, geom='full_geom', format='geojson')

Get administrative boundaries for a specific level or all levels.

This is a convenience method that can retrieve boundaries for a single administrative level or attempt to fetch all available levels.

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view to query.

required
admin_level int

Administrative level to retrieve (0=country, 1=region, etc.). If None, attempts to fetch all levels.

None
geom str

Geometry inclusion level. Options:
- "no_geom": No geometry data
- "centroid": Only centroid points
- "full_geom": Complete boundary geometries
Defaults to "full_geom".

'full_geom'
format str

Response format ("json" or "geojson"). Defaults to "geojson".

'geojson'

Returns:

Name Type Description
dict

JSON/GeoJSON response containing administrative boundaries in the specified format. For GeoJSON, returns a FeatureCollection.

Raises:

Type Description
HTTPError

If the API request fails or parameters are invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def get_admin_boundaries(
    self, view_uuid, admin_level=None, geom="full_geom", format="geojson"
):
    """
    Get administrative boundaries for a specific level or all levels.

    This is a convenience method that can retrieve boundaries for a single
    administrative level or attempt to fetch all available levels.

    Args:
        view_uuid (str): The UUID of the view to query.
        admin_level (int, optional): Administrative level to retrieve
            (0=country, 1=region, etc.). If None, attempts to fetch all levels.
        geom (str, optional): Geometry inclusion level. Options:
            - "no_geom": No geometry data
            - "centroid": Only centroid points
            - "full_geom": Complete boundary geometries
            Defaults to "full_geom".
        format (str, optional): Response format ("json" or "geojson").
            Defaults to "geojson".

    Returns:
        dict: JSON/GeoJSON response containing administrative boundaries
            in the specified format. For GeoJSON, returns a FeatureCollection.

    Raises:
        requests.HTTPError: If the API request fails or parameters are invalid.
    """
    # Construct the endpoint based on whether admin_level is provided
    if admin_level is not None:
        endpoint = (
            f"{self.base_url}/search/view/{view_uuid}/entity/level/{admin_level}/"
        )
    else:
        # For all levels, we need to fetch level 0 and then get children for each entity
        endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/list/"

    params = {
        "geom": geom,
        "format": format,
        "page_size": 100,
    }

    response = self._make_request("GET", endpoint, params=params)
    return response.json()
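
A sketch of fetching level-1 boundaries as GeoJSON (view_uuid is assumed to reference an existing view):

# Returns a dict; with format="geojson" and a single admin level this is
# typically a FeatureCollection.
level1 = client.get_admin_boundaries(view_uuid, admin_level=1)
print(len(level1.get("features", [])), "features returned")
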
get_dataset_details(dataset_uuid)

Get detailed information about a specific dataset.

This includes metadata about the dataset and information about available administrative levels (e.g., country, province, district).

Parameters:

Name Type Description Default
dataset_uuid str

The UUID of the dataset to query.

required

Returns:

Name Type Description
dict

JSON response containing dataset details including:
- Basic metadata (name, description, etc.)
- Available administrative levels and their properties
- Temporal information and data sources

Raises:

Type Description
HTTPError

If the API request fails or dataset_uuid is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def get_dataset_details(self, dataset_uuid):
    """
    Get detailed information about a specific dataset.

    This includes metadata about the dataset and information about
    available administrative levels (e.g., country, province, district).

    Args:
        dataset_uuid (str): The UUID of the dataset to query.

    Returns:
        dict: JSON response containing dataset details including:
            - Basic metadata (name, description, etc.)
            - Available administrative levels and their properties
            - Temporal information and data sources

    Raises:
        requests.HTTPError: If the API request fails or dataset_uuid is invalid.
    """
    endpoint = f"{self.base_url}/search/dataset/{dataset_uuid}/"
    response = self._make_request("GET", endpoint)
    return response.json()
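
For instance, the available administrative levels can be inspected from the response; the 'dataset_levels' key is the one used by get_country_boundaries_by_iso3 below (treat its exact contents as an assumption about the payload):

details = client.get_dataset_details(dataset_uuid)
for level_info in details.get("dataset_levels", []):
    print(level_info)
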
get_entity_by_ucode(ucode, geom='full_geom', format='geojson')

Get detailed information about a specific entity using its Ucode.

A Ucode (Universal Code) is a unique identifier for geographic entities within the GeoRepo system, typically in the format "ISO3_LEVEL_NAME".

Parameters:

Name Type Description Default
ucode str

The unique code identifier for the entity.

required
geom str

Geometry inclusion level. Options:
- "no_geom": No geometry data
- "centroid": Only centroid points
- "full_geom": Complete boundary geometries
Defaults to "full_geom".

'full_geom'
format str

Response format ("json" or "geojson"). Defaults to "geojson".

'geojson'

Returns:

Name Type Description
dict

JSON/GeoJSON response containing entity details including geometry, properties, administrative level, and metadata.

Raises:

Type Description
HTTPError

If the API request fails or ucode is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def get_entity_by_ucode(self, ucode, geom="full_geom", format="geojson"):
    """
    Get detailed information about a specific entity using its Ucode.

    A Ucode (Universal Code) is a unique identifier for geographic entities
    within the GeoRepo system, typically in the format "ISO3_LEVEL_NAME".

    Args:
        ucode (str): The unique code identifier for the entity.
        geom (str, optional): Geometry inclusion level. Options:
            - "no_geom": No geometry data
            - "centroid": Only centroid points
            - "full_geom": Complete boundary geometries
            Defaults to "full_geom".
        format (str, optional): Response format ("json" or "geojson").
            Defaults to "geojson".

    Returns:
        dict: JSON/GeoJSON response containing entity details including
            geometry, properties, administrative level, and metadata.

    Raises:
        requests.HTTPError: If the API request fails or ucode is invalid.
    """
    endpoint = f"{self.base_url}/search/entity/ucode/{ucode}/"
    params = {"geom": geom, "format": format}
    response = self._make_request("GET", endpoint, params=params)
    return response.json()
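
A sketch that resolves a country via its ISO3 code and then fetches the full entity, assuming client and view_uuid are already set up:

country = client.find_country_by_iso3(view_uuid, "KEN")
# With the defaults (geom="full_geom", format="geojson") the response carries
# the entity's boundary geometry.
entity = client.get_entity_by_ucode(country["ucode"])
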
get_vector_tiles_url(view_info)

Generate an authenticated URL for accessing vector tiles.

Vector tiles are used for efficient map rendering and can be consumed by mapping libraries like Mapbox GL JS or OpenLayers.

Parameters:

Name Type Description Default
view_info dict

Dictionary containing view information that must include a 'vector_tiles' key with the base vector tiles URL.

required

Returns:

Name Type Description
str

Fully authenticated vector tiles URL with API key and user email parameters appended for access control.

Raises:

Type Description
ValueError

If 'vector_tiles' key is not found in view_info.

Source code in gigaspatial/handlers/unicef_georepo.py
def get_vector_tiles_url(self, view_info):
    """
    Generate an authenticated URL for accessing vector tiles.

    Vector tiles are used for efficient map rendering and can be consumed
    by mapping libraries like Mapbox GL JS or OpenLayers.

    Args:
        view_info (dict): Dictionary containing view information that must
            include a 'vector_tiles' key with the base vector tiles URL.

    Returns:
        str: Fully authenticated vector tiles URL with API key and user email
            parameters appended for access control.

    Raises:
        ValueError: If 'vector_tiles' key is not found in view_info.
    """
    if "vector_tiles" not in view_info:
        raise ValueError("Vector tiles URL not found in view information")

    vector_tiles_url = view_info["vector_tiles"]

    # Parse out the timestamp parameter if it exists
    if "?t=" in vector_tiles_url:
        base_url, timestamp = vector_tiles_url.split("?t=")
        return f"{base_url}?t={timestamp}&token={self.api_key}&georepo_user_key={self.email}"
    else:
        return (
            f"{vector_tiles_url}?token={self.api_key}&georepo_user_key={self.email}"
        )
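
A sketch, assuming the view record returned by the API includes a 'vector_tiles' key:

views = client.list_views_by_dataset(dataset_uuid)
view_info = views["results"][0]

# The returned URL embeds the API key and user email as query parameters.
tiles_url = client.get_vector_tiles_url(view_info)
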
list_datasets_by_module(module_uuid)

List all datasets within a specific module.

A dataset represents a collection of related geographic entities, such as administrative boundaries for a specific country or region.

Parameters:

Name Type Description Default
module_uuid str

The UUID of the module to query.

required

Returns:

Name Type Description
dict

JSON response containing a list of datasets with their metadata. Each dataset includes 'uuid', 'name', 'description', creation date, etc.

Raises:

Type Description
HTTPError

If the API request fails or module_uuid is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_datasets_by_module(self, module_uuid):
    """
    List all datasets within a specific module.

    A dataset represents a collection of related geographic entities,
    such as administrative boundaries for a specific country or region.

    Args:
        module_uuid (str): The UUID of the module to query.

    Returns:
        dict: JSON response containing a list of datasets with their metadata.
            Each dataset includes 'uuid', 'name', 'description', creation date, etc.

    Raises:
        requests.HTTPError: If the API request fails or module_uuid is invalid.
    """
    endpoint = f"{self.base_url}/search/module/{module_uuid}/dataset/list/"
    response = self._make_request("GET", endpoint)
    return response.json()
list_entities_by_admin_level(view_uuid, admin_level, geom='no_geom', format='json', page=1, page_size=50)

List entities at a specific administrative level within a view.

Administrative levels typically follow a hierarchy:
- Level 0: Countries
- Level 1: States/Provinces/Regions
- Level 2: Districts/Counties
- Level 3: Sub-districts/Municipalities
- And so on...

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view to query.

required
admin_level int

The administrative level to retrieve (0, 1, 2, etc.).

required
geom str

Geometry inclusion level. Options:
- "no_geom": No geometry data
- "centroid": Only centroid points
- "full_geom": Complete boundary geometries
Defaults to "no_geom".

'no_geom'
format str

Response format ("json" or "geojson"). Defaults to "json".

'json'
page int

Page number for pagination. Defaults to 1.

1
page_size int

Number of results per page. Defaults to 50.

50

Returns:

Name Type Description
tuple

A tuple containing:
- dict: JSON/GeoJSON response with entity data
- dict: Metadata with pagination info (page, total_page, total_count)

Raises:

Type Description
HTTPError

If the API request fails or parameters are invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_entities_by_admin_level(
    self,
    view_uuid,
    admin_level,
    geom="no_geom",
    format="json",
    page=1,
    page_size=50,
):
    """
    List entities at a specific administrative level within a view.

    Administrative levels typically follow a hierarchy:
    - Level 0: Countries
    - Level 1: States/Provinces/Regions
    - Level 2: Districts/Counties
    - Level 3: Sub-districts/Municipalities
    - And so on...

    Args:
        view_uuid (str): The UUID of the view to query.
        admin_level (int): The administrative level to retrieve (0, 1, 2, etc.).
        geom (str, optional): Geometry inclusion level. Options:
            - "no_geom": No geometry data
            - "centroid": Only centroid points
            - "full_geom": Complete boundary geometries
            Defaults to "no_geom".
        format (str, optional): Response format ("json" or "geojson").
            Defaults to "json".
        page (int, optional): Page number for pagination. Defaults to 1.
        page_size (int, optional): Number of results per page. Defaults to 50.

    Returns:
        tuple: A tuple containing:
            - dict: JSON/GeoJSON response with entity data
            - dict: Metadata with pagination info (page, total_page, total_count)

    Raises:
        requests.HTTPError: If the API request fails or parameters are invalid.
    """
    endpoint = (
        f"{self.base_url}/search/view/{view_uuid}/entity/level/{admin_level}/"
    )
    params = {"page": page, "page_size": page_size, "geom": geom, "format": format}
    response = self._make_request("GET", endpoint, params=params)

    metadata = {
        "page": int(response.headers.get("page", 1)),
        "total_page": int(response.headers.get("total_page", 1)),
        "total_count": int(response.headers.get("count", 0)),
    }

    return response.json(), metadata
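
Because responses are paginated, callers typically loop until the metadata reports the last page, mirroring the pattern used by find_country_by_iso3:

entities = []
page = 1
while True:
    data, meta = client.list_entities_by_admin_level(view_uuid, 2, page=page)
    entities.extend(data.get("results", []))
    if page >= meta["total_page"]:
        break
    page += 1
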
list_entity_children(view_uuid, entity_ucode, geom='no_geom', format='json')

List direct children of an entity in the administrative hierarchy.

For example, if given a country entity, this will return its states/provinces. If given a state entity, this will return its districts/counties.

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view containing the entity.

required
entity_ucode str

The Ucode of the parent entity.

required
geom str

Geometry inclusion level. Options:
- "no_geom": No geometry data
- "centroid": Only centroid points
- "full_geom": Complete boundary geometries
Defaults to "no_geom".

'no_geom'
format str

Response format ("json" or "geojson"). Defaults to "json".

'json'

Returns:

Name Type Description
dict

JSON/GeoJSON response containing list of child entities with their properties and optional geometry data.

Raises:

Type Description
HTTPError

If the API request fails or parameters are invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_entity_children(
    self, view_uuid, entity_ucode, geom="no_geom", format="json"
):
    """
    List direct children of an entity in the administrative hierarchy.

    For example, if given a country entity, this will return its states/provinces.
    If given a state entity, this will return its districts/counties.

    Args:
        view_uuid (str): The UUID of the view containing the entity.
        entity_ucode (str): The Ucode of the parent entity.
        geom (str, optional): Geometry inclusion level. Options:
            - "no_geom": No geometry data
            - "centroid": Only centroid points
            - "full_geom": Complete boundary geometries
            Defaults to "no_geom".
        format (str, optional): Response format ("json" or "geojson").
            Defaults to "json".

    Returns:
        dict: JSON/GeoJSON response containing list of child entities
            with their properties and optional geometry data.

    Raises:
        requests.HTTPError: If the API request fails or parameters are invalid.
    """
    endpoint = (
        f"{self.base_url}/search/view/{view_uuid}/entity/{entity_ucode}/children/"
    )
    params = {"geom": geom, "format": format}
    response = self._make_request("GET", endpoint, params=params)
    return response.json()
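
A sketch that walks one level down from a country entity (country is assumed to come from find_country_by_iso3):

children = client.list_entity_children(view_uuid, country["ucode"])
for child in children.get("results", []):
    print(child.get("name"), child.get("ucode"))
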
list_modules()

List all available modules in GeoRepo.

A module is a top-level organizational unit that contains datasets. Examples include "Admin Boundaries", "Health Facilities", etc.

Returns:

Name Type Description
dict

JSON response containing a list of modules with their metadata. Each module includes 'uuid', 'name', 'description', and other properties.

Raises:

Type Description
HTTPError

If the API request fails.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_modules(self):
    """
    List all available modules in GeoRepo.

    A module is a top-level organizational unit that contains datasets.
    Examples include "Admin Boundaries", "Health Facilities", etc.

    Returns:
        dict: JSON response containing a list of modules with their metadata.
            Each module includes 'uuid', 'name', 'description', and other properties.

    Raises:
        requests.HTTPError: If the API request fails.
    """
    endpoint = f"{self.base_url}/search/module/list/"
    response = self._make_request("GET", endpoint)
    return response.json()
list_views_by_dataset(dataset_uuid, page=1, page_size=50)

List views for a dataset with pagination support.

A view represents a specific version or subset of a dataset. Views may be tagged as 'latest' or represent different time periods.

Parameters:

Name Type Description Default
dataset_uuid str

The UUID of the dataset to query.

required
page int

Page number for pagination. Defaults to 1.

1
page_size int

Number of results per page. Defaults to 50.

50

Returns:

Name Type Description
dict

JSON response containing paginated list of views with metadata. Includes 'results', 'total_page', 'current_page', and 'count' fields. Each view includes 'uuid', 'name', 'tags', and other properties.

Raises:

Type Description
HTTPError

If the API request fails or dataset_uuid is invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def list_views_by_dataset(self, dataset_uuid, page=1, page_size=50):
    """
    List views for a dataset with pagination support.

    A view represents a specific version or subset of a dataset.
    Views may be tagged as 'latest' or represent different time periods.

    Args:
        dataset_uuid (str): The UUID of the dataset to query.
        page (int, optional): Page number for pagination. Defaults to 1.
        page_size (int, optional): Number of results per page. Defaults to 50.

    Returns:
        dict: JSON response containing paginated list of views with metadata.
            Includes 'results', 'total_page', 'current_page', and 'count' fields.
            Each view includes 'uuid', 'name', 'tags', and other properties.

    Raises:
        requests.HTTPError: If the API request fails or dataset_uuid is invalid.
    """
    endpoint = f"{self.base_url}/search/dataset/{dataset_uuid}/view/list/"
    params = {"page": page, "page_size": page_size}
    response = self._make_request("GET", endpoint, params=params)
    return response.json()
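
The listing methods (list_modules, list_datasets_by_module, list_views_by_dataset) are typically chained to discover a working view UUID; a sketch, picking the first result at each step purely for illustration:

modules = client.list_modules()
module_uuid = modules["results"][0]["uuid"]

datasets = client.list_datasets_by_module(module_uuid)
dataset_uuid = datasets["results"][0]["uuid"]

views = client.list_views_by_dataset(dataset_uuid)
view_uuid = views["results"][0]["uuid"]
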
search_entities_by_name(view_uuid, name, page=1, page_size=50)

Search for entities by name using fuzzy matching.

This performs a similarity-based search to find entities whose names match or are similar to the provided search term.

Parameters:

Name Type Description Default
view_uuid str

The UUID of the view to search within.

required
name str

The name or partial name to search for.

required
page int

Page number for pagination. Defaults to 1.

1
page_size int

Number of results per page. Defaults to 50.

50

Returns:

Name Type Description
dict

JSON response containing paginated search results with matching entities and their similarity scores.

Raises:

Type Description
HTTPError

If the API request fails or parameters are invalid.

Source code in gigaspatial/handlers/unicef_georepo.py
def search_entities_by_name(self, view_uuid, name, page=1, page_size=50):
    """
    Search for entities by name using fuzzy matching.

    This performs a similarity-based search to find entities whose names
    match or are similar to the provided search term.

    Args:
        view_uuid (str): The UUID of the view to search within.
        name (str): The name or partial name to search for.
        page (int, optional): Page number for pagination. Defaults to 1.
        page_size (int, optional): Number of results per page. Defaults to 50.

    Returns:
        dict: JSON response containing paginated search results with
            matching entities and their similarity scores.

    Raises:
        requests.HTTPError: If the API request fails or parameters are invalid.
    """
    endpoint = f"{self.base_url}/search/view/{view_uuid}/entity/{name}/"
    params = {"page": page, "page_size": page_size}
    response = self._make_request("GET", endpoint, params=params)
    return response.json()
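
A usage sketch (view_uuid is assumed; the exact similarity fields in each result depend on the API response):

matches = client.search_entities_by_name(view_uuid, "Nairobi")
for entity in matches.get("results", []):
    print(entity.get("name"), entity.get("ucode"))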

find_admin_boundaries_module()

Find and return the UUID of the Admin Boundaries module.

This is a convenience function that searches through all available modules to locate the one named "Admin Boundaries", which typically contains administrative boundary datasets.

Returns:

Name Type Description
str

The UUID of the Admin Boundaries module.

Raises:

Type Description
ValueError

If the Admin Boundaries module is not found.

Source code in gigaspatial/handlers/unicef_georepo.py
def find_admin_boundaries_module():
    """
    Find and return the UUID of the Admin Boundaries module.

    This is a convenience function that searches through all available modules
    to locate the one named "Admin Boundaries", which typically contains
    administrative boundary datasets.

    Returns:
        str: The UUID of the Admin Boundaries module.

    Raises:
        ValueError: If the Admin Boundaries module is not found.
    """
    client = GeoRepoClient()
    modules = client.list_modules()

    for module in modules.get("results", []):
        if module["name"] == "Admin Boundaries":
            return module["uuid"]

    raise ValueError("Admin Boundaries module not found")
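
A sketch that combines this helper with a client to list the module's datasets:

module_uuid = find_admin_boundaries_module()
datasets = GeoRepoClient().list_datasets_by_module(module_uuid)
for dataset in datasets.get("results", []):
    print(dataset["name"], dataset["uuid"])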

get_country_boundaries_by_iso3(iso3_code, client=None, admin_level=None)

Get administrative boundaries for a specific country using its ISO3 code.

This function provides a high-level interface to retrieve country boundaries by automatically finding the appropriate module, dataset, and view, then fetching the requested administrative boundaries.

The function will:
1. Find the Admin Boundaries module
2. Locate a global dataset within that module
3. Find the latest view of that dataset
4. Search for the country using the ISO3 code
5. Look for a country-specific view if available
6. Retrieve boundaries at the specified admin level or all levels

Parameters:

Name Type Description Default
iso3_code str

The ISO3 country code (e.g., 'USA', 'KEN', 'BRA').

required
client GeoRepoClient

An existing GeoRepoClient instance to reuse for API calls. If not provided, a new client is created from the configured credentials.

None
admin_level int

The administrative level to retrieve:
- 0: Country level
- 1: State/Province/Region level
- 2: District/County level
- 3: Sub-district/Municipality level
- etc.
If None, retrieves all available administrative levels.

None

Returns:

Name Type Description
dict

A GeoJSON FeatureCollection containing the requested boundaries. Each feature includes geometry and properties for the administrative unit.

Raises:

Type Description
ValueError

If the Admin Boundaries module, datasets, views, or country cannot be found.

HTTPError

If any API requests fail.

Note

This function may make multiple API calls and can take some time for countries with many administrative units. It handles pagination automatically and attempts to use country-specific views when available for better performance.

Example

# Get all administrative levels for Kenya
boundaries = get_country_boundaries_by_iso3('KEN')

# Get only province-level boundaries for Kenya
provinces = get_country_boundaries_by_iso3('KEN', admin_level=1)

Source code in gigaspatial/handlers/unicef_georepo.py
def get_country_boundaries_by_iso3(
    iso3_code, client: GeoRepoClient = None, admin_level=None
):
    """
    Get administrative boundaries for a specific country using its ISO3 code.

    This function provides a high-level interface to retrieve country boundaries
    by automatically finding the appropriate module, dataset, and view, then
    fetching the requested administrative boundaries.

    The function will:
    1. Find the Admin Boundaries module
    2. Locate a global dataset within that module
    3. Find the latest view of that dataset
    4. Search for the country using the ISO3 code
    5. Look for a country-specific view if available
    6. Retrieve boundaries at the specified admin level or all levels

    Args:
        iso3_code (str): The ISO3 country code (e.g., 'USA', 'KEN', 'BRA').
        admin_level (int, optional): The administrative level to retrieve:
            - 0: Country level
            - 1: State/Province/Region level
            - 2: District/County level
            - 3: Sub-district/Municipality level
            - etc.
            If None, retrieves all available administrative levels.

    Returns:
        dict: A GeoJSON FeatureCollection containing the requested boundaries.
            Each feature includes geometry and properties for the administrative unit.

    Raises:
        ValueError: If the Admin Boundaries module, datasets, views, or country
            cannot be found.
        requests.HTTPError: If any API requests fail.

    Note:
        This function may make multiple API calls and can take some time for
        countries with many administrative units. It handles pagination
        automatically and attempts to use country-specific views when available
        for better performance.

    Example:
        >>> # Get all administrative levels for Kenya
        >>> boundaries = get_country_boundaries_by_iso3('KEN')
        >>>
        >>> # Get only province-level boundaries for Kenya
        >>> provinces = get_country_boundaries_by_iso3('KEN', admin_level=1)
    """
    client = client or GeoRepoClient()

    client.logger.info("Finding Admin Boundaries module...")
    modules = client.list_modules()
    admin_module_uuid = None

    for module in modules.get("results", []):
        if "Admin Boundaries" in module["name"]:
            admin_module_uuid = module["uuid"]
            client.logger.info(
                f"Found Admin Boundaries module: {module['name']} ({admin_module_uuid})"
            )
            break

    if not admin_module_uuid:
        raise ValueError("Admin Boundaries module not found")

    client.logger.info(f"Finding datasets in the Admin Boundaries module...")
    datasets = client.list_datasets_by_module(admin_module_uuid)
    global_dataset_uuid = None

    for dataset in datasets.get("results", []):
        if any(keyword in dataset["name"].lower() for keyword in ["global"]):
            global_dataset_uuid = dataset["uuid"]
            client.logger.info(
                f"Found global dataset: {dataset['name']} ({global_dataset_uuid})"
            )
            break

    if not global_dataset_uuid:
        if datasets.get("results"):
            global_dataset_uuid = datasets["results"][0]["uuid"]
            client.logger.info(
                f"Using first available dataset: {datasets['results'][0]['name']} ({global_dataset_uuid})"
            )
        else:
            raise ValueError("No datasets found in the Admin Boundaries module")

    client.logger.info(f"Finding views in the dataset...")
    views = client.list_views_by_dataset(global_dataset_uuid)
    latest_view_uuid = None

    for view in views.get("results", []):
        if "tags" in view and "latest" in view["tags"]:
            latest_view_uuid = view["uuid"]
            client.logger.info(
                f"Found latest view: {view['name']} ({latest_view_uuid})"
            )
            break

    if not latest_view_uuid:
        if views.get("results"):
            latest_view_uuid = views["results"][0]["uuid"]
            client.logger.info(
                f"Using first available view: {views['results'][0]['name']} ({latest_view_uuid})"
            )
        else:
            raise ValueError("No views found in the dataset")

    # Search for the country by ISO3 code
    client.logger.info(f"Searching for country with ISO3 code: {iso3_code}...")
    country_entity = client.find_country_by_iso3(latest_view_uuid, iso3_code)

    if not country_entity:
        raise ValueError(f"Country with ISO3 code '{iso3_code}' not found")

    country_ucode = country_entity["ucode"]
    country_name = country_entity["name"]
    client.logger.info(f"Found country: {country_name} (Ucode: {country_ucode})")

    # Search for country-specific view
    client.logger.info(f"Checking for country-specific view...")
    country_view_uuid = None
    all_views = []

    # Need to fetch all pages of views
    page = 1
    while True:
        views_page = client.list_views_by_dataset(global_dataset_uuid, page=page)
        all_views.extend(views_page.get("results", []))
        if page >= views_page.get("total_page", 1):
            break
        page += 1

    # Look for a view specifically for this country
    for view in all_views:
        if country_name.lower() in view["name"].lower() and "latest" in view.get(
            "tags", []
        ):
            country_view_uuid = view["uuid"]
            client.logger.info(
                f"Found country-specific view: {view['name']} ({country_view_uuid})"
            )
            break

    # Get boundaries based on admin level
    if country_view_uuid:
        client.logger.info(country_view_uuid)
        # If we found a view specific to this country, use it
        client.logger.info(f"Getting admin boundaries from country-specific view...")
        if admin_level is not None:
            client.logger.info(f"Fetching admin level {admin_level} boundaries...")

            # Handle pagination for large datasets
            all_features = []
            page = 1
            while True:
                result, meta = client.list_entities_by_admin_level(
                    country_view_uuid,
                    admin_level,
                    geom="full_geom",
                    format="geojson",
                    page=page,
                    page_size=50,
                )

                # Add features to our collection
                if "features" in result:
                    all_features.extend(result["features"])
                elif "results" in result:
                    # Convert entities to GeoJSON features if needed
                    for entity in result["results"]:
                        if "geometry" in entity:
                            feature = {
                                "type": "Feature",
                                "properties": {
                                    k: v for k, v in entity.items() if k != "geometry"
                                },
                                "geometry": entity["geometry"],
                            }
                            all_features.append(feature)

                # Check if there are more pages
                if page >= meta["total_page"]:
                    break

                page += 1

            boundaries = {"type": "FeatureCollection", "features": all_features}
        else:
            # Get all admin levels by fetching each level separately
            boundaries = {"type": "FeatureCollection", "features": []}

            # Get dataset details to find available admin levels
            dataset_details = client.get_dataset_details(global_dataset_uuid)
            max_level = 0

            for level_info in dataset_details.get("dataset_levels", []):
                if isinstance(level_info.get("level"), int):
                    max_level = max(max_level, level_info["level"])

            client.logger.info(f"Dataset has admin levels from 0 to {max_level}")

            # Fetch each admin level
            for level in range(max_level + 1):
                client.logger.info(f"Fetching admin level {level}...")
                try:
                    level_data, meta = client.list_entities_by_admin_level(
                        country_view_uuid, level, geom="full_geom", format="geojson"
                    )

                    if "features" in level_data:
                        boundaries["features"].extend(level_data["features"])
                    elif "results" in level_data:
                        # Process each page of results
                        page = 1
                        while True:
                            result, meta = client.list_entities_by_admin_level(
                                country_view_uuid,
                                level,
                                geom="full_geom",
                                format="geojson",
                                page=page,
                            )

                            if "features" in result:
                                boundaries["features"].extend(result["features"])

                            # Check for more pages
                            if page >= meta["total_page"]:
                                break

                            page += 1

                except Exception as e:
                    client.logger.warning(f"Error fetching admin level {level}: {e}")
    else:
        # Use the global view with filtering
        client.logger.info(f"Using global view and filtering by country...")

        # Function to recursively get all descendants
        def get_all_children(
            parent_ucode, view_uuid, level=1, max_depth=5, admin_level_filter=None
        ):
            """
            Recursively retrieve all child entities of a parent entity.

            Args:
                parent_ucode (str): The Ucode of the parent entity.
                view_uuid (str): The UUID of the view to query.
                level (int): Current recursion level (for depth limiting).
                max_depth (int): Maximum recursion depth to prevent infinite loops.
                admin_level_filter (int, optional): If specified, only return
                    entities at this specific administrative level.

            Returns:
                list: List of GeoJSON features for all child entities.
            """
            if level > max_depth:
                return []

            try:
                children = client.list_entity_children(view_uuid, parent_ucode)
                features = []

                for child in children.get("results", []):
                    # Skip if we're filtering by admin level and this doesn't match
                    if (
                        admin_level_filter is not None
                        and child.get("admin_level") != admin_level_filter
                    ):
                        continue

                    # Get the child with full geometry
                    child_entity = client.get_entity_by_ucode(child["ucode"])
                    if "features" in child_entity:
                        features.extend(child_entity["features"])

                    # Recursively get grandchildren if not filtering by admin level
                    if admin_level_filter is None:
                        features.extend(
                            get_all_children(
                                child["ucode"], view_uuid, level + 1, max_depth
                            )
                        )

                return features
            except Exception as e:
                client.logger.warning(f"Error getting children for {parent_ucode}: {e}")
                return []

        # Start with the country boundaries
        boundaries = {"type": "FeatureCollection", "features": []}

        # If admin_level is 0, just get the country entity
        if admin_level == 0:
            country_entity = client.get_entity_by_ucode(country_ucode)
            if "features" in country_entity:
                boundaries["features"].extend(country_entity["features"])
        # If specific admin level requested, get all entities at that level
        elif admin_level is not None:
            children_features = get_all_children(
                country_ucode,
                latest_view_uuid,
                max_depth=admin_level + 1,
                admin_level_filter=admin_level,
            )
            boundaries["features"].extend(children_features)
        # If no admin_level specified, get all levels
        else:
            # Start with the country entity
            country_entity = client.get_entity_by_ucode(country_ucode)
            if "features" in country_entity:
                boundaries["features"].extend(country_entity["features"])

            # Get all descendants
            children_features = get_all_children(
                country_ucode, latest_view_uuid, max_depth=5
            )
            boundaries["features"].extend(children_features)

    return boundaries
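
Passing an existing client avoids re-reading credentials on every call; a sketch:

client = GeoRepoClient()
districts = get_country_boundaries_by_iso3("KEN", client=client, admin_level=2)
print(len(districts["features"]), "district features")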

worldpop

WorldPopConfig

Bases: BaseModel

Source code in gigaspatial/handlers/worldpop.py
class WorldPopConfig(BaseModel):
    # class variables
    _metadata_cache: ClassVar[Optional[pd.DataFrame]] = None

    # constants
    CURRENT_MAX_YEAR: int = 2022
    EARLIEST_YEAR: int = 2000
    SCHOOL_AGE_YEAR: int = 2020

    # base config
    WORLDPOP_DB_BASE_URL: HttpUrl = Field(default="https://data.worldpop.org/")
    SCHOOL_AGE_POPULATION_PATH: str = Field(
        default="GIS/AgeSex_structures/school_age_population/v1/2020/"
    )
    PPP_2021_2022_PATH: str = Field(
        default="GIS/Population/Global_2021_2022_1km_UNadj/"
    )
    DATASETS_METADATA_PATH: str = Field(default="assets/wpgpDatasets.csv")

    # user config
    base_path: Path = Field(default=global_config.get_path("worldpop", "bronze"))
    country: str = Field(...)
    year: int = Field(..., ge=EARLIEST_YEAR, le=CURRENT_MAX_YEAR)
    resolution: Literal["HIGH", "LOW"] = Field(
        default="LOW",
        description="Spatial resolution of the population grid: HIGH (100m) or LOW (1km)",
    )
    un_adjusted: bool = True
    constrained: bool = False
    school_age: Optional[Literal["PRIMARY", "SECONDARY"]] = None
    gender: Literal["F", "M", "F_M"] = "F_M"

    @field_validator("country")
    def validate_country(cls, value: str) -> str:
        try:
            return pycountry.countries.lookup(value).alpha_3
        except LookupError:
            raise ValueError(f"Invalid country code provided: {value}")

    @model_validator(mode="after")
    def validate_configuration(self):
        """
        Validate that the configuration is valid based on dataset availability constraints.

        Specific rules:
        - Post-2020 data is only available at 1km resolution with UN adjustment
        - School age population data is only available for 2020 at 1km resolution
        """
        if self.year > self.SCHOOL_AGE_YEAR:
            if self.resolution != "LOW":
                raise ValueError(
                    f"Data for year {self.year} is only available at LOW (1km) resolution"
                )

            if not self.un_adjusted:
                raise ValueError(
                    f"Data for year {self.year} is only available with UN adjustment"
                )

        if self.school_age:
            if self.resolution != "LOW":
                raise ValueError(
                    f"School age data is only available at LOW (1km) resolution"
                )

            if self.year != self.SCHOOL_AGE_YEAR:
                self.year = self.SCHOOL_AGE_YEAR
                raise ValueError(f"School age data is only available for 2020")

        return self

    @property
    def dataset_url(self) -> str:
        """Get the URL for the configured dataset. The URL is computed on first access and then cached for subsequent calls."""
        if not hasattr(self, "_dataset_url"):
            self._dataset_url = self._compute_dataset_url()
        return self._dataset_url

    @property
    def dataset_path(self) -> Path:
        """Construct and return the path for the configured dataset."""
        url_parts = self.dataset_url.split("/")
        file_path = (
            "/".join(
                [url_parts[4], url_parts[5], url_parts[7], self.country, url_parts[-1]]
            )
            if self.school_age
            else "/".join([url_parts[4], url_parts[6], self.country, url_parts[-1]])
        )
        return self.base_path / file_path

    def _load_datasets_metadata(self) -> pd.DataFrame:
        """Load and return the WorldPop datasets metadata, using cache if available."""
        if WorldPopConfig._metadata_cache is not None:
            return WorldPopConfig._metadata_cache

        try:
            WorldPopConfig._metadata_cache = pd.read_csv(
                str(self.WORLDPOP_DB_BASE_URL) + self.DATASETS_METADATA_PATH
            )
            return WorldPopConfig._metadata_cache
        except (URLError, pd.errors.EmptyDataError) as e:
            raise RuntimeError(f"Failed to load WorldPop datasets metadata: {e}")

    def _compute_dataset_url(self) -> str:
        """Construct and return the URL for the configured dataset."""
        # handle post-2020 datasets
        if self.year > self.SCHOOL_AGE_YEAR:
            return (
                str(self.WORLDPOP_DB_BASE_URL)
                + self.PPP_2021_2022_PATH
                + f"{'' if self.constrained else 'un'}constrained/{self.year}/{self.country}/{self.country.lower()}_ppp_{self.year}_1km_UNadj{'_constrained' if self.constrained else ''}.tif"
            )

        # handle school-age population datasets
        if self.school_age:
            return (
                str(self.WORLDPOP_DB_BASE_URL)
                + self.SCHOOL_AGE_POPULATION_PATH
                + f"{self.country}/{self.country}_SAP_1km_2020/{self.country}_{self.gender}_{self.school_age}_2020_1km.tif"
            )

        # handle standard population datasets
        wp_metadata = self._load_datasets_metadata()

        try:
            dataset_url = (
                self.WORLDPOP_DB_BASE_URL
                + wp_metadata[
                    (wp_metadata.ISO3 == self.country)
                    & (
                        wp_metadata.Covariate
                        == "ppp_"
                        + str(self.year)
                        + ("_UNadj" if self.un_adjusted else "")
                    )
                ].PathToRaster.values[0]
            )
        except IndexError:
            raise ValueError(
                f"No dataset found for country={self.country}, year={self.year}, un_adjusted={self.un_adjusted}"
            )

        # handle resolution conversion if needed
        if self.resolution == "HIGH":
            return dataset_url

        url_parts = dataset_url.split("/")
        url_parts[5] = (
            url_parts[5] + "_1km" + ("_UNadj" if self.un_adjusted else "")
        )  # get 1km folder with UNadj specification
        url_parts[8] = url_parts[8].replace(
            str(self.year), str(self.year) + "_1km_Aggregated"
        )  # get filename with 1km res
        dataset_url = "/".join(url_parts)

        return dataset_url

    def __repr__(self) -> str:
        parts = [
            "WorldPopConfig(",
            f"  country='{self.country}'",
            f"  year={self.year}",
            f"  resolution={self.resolution}",
            f"  un_adjusted={self.un_adjusted}",
            f"  constrained={self.constrained}",
        ]

        if self.school_age:
            parts.append(f"  school_age='{self.school_age}'")
            parts.append(f"  gender='{self.gender}'")

        parts.append(")")

        return "\n".join(parts)
dataset_path: Path property

Construct and return the path for the configured dataset.

dataset_url: str property

Get the URL for the configured dataset. The URL is computed on first access and then cached for subsequent calls.

validate_configuration()

Validate that the configuration is valid based on dataset availability constraints.

Specific rules:

- Post-2020 data is only available at 1km resolution with UN adjustment
- School age population data is only available for 2020 at 1km resolution

Source code in gigaspatial/handlers/worldpop.py
@model_validator(mode="after")
def validate_configuration(self):
    """
    Validate that the configuration is valid based on dataset availability constraints.

    Specific rules:
    - Post-2020 data is only available at 1km resolution with UN adjustment
    - School age population data is only available for 2020 at 1km resolution
    """
    if self.year > self.SCHOOL_AGE_YEAR:
        if self.resolution != "LOW":
            raise ValueError(
                f"Data for year {self.year} is only available at LOW (1km) resolution"
            )

        if not self.un_adjusted:
            raise ValueError(
                f"Data for year {self.year} is only available with UN adjustment"
            )

    if self.school_age:
        if self.resolution != "LOW":
            raise ValueError(
                "School age data is only available at LOW (1km) resolution"
            )

        if self.year != self.SCHOOL_AGE_YEAR:
            raise ValueError(
                f"School age data is only available for {self.SCHOOL_AGE_YEAR}"
            )

    return self
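
A minimal sketch (not taken from the library documentation) of how these rules surface to callers. It assumes WorldPopConfig is importable from gigaspatial.handlers.worldpop, that country codes are ISO3 as used in the metadata lookup, and that the remaining fields have usable defaults:

from gigaspatial.handlers.worldpop import WorldPopConfig

# Post-2020 data must be requested at LOW (1km) resolution with UN adjustment.
config = WorldPopConfig(country="KEN", year=2022, resolution="LOW", un_adjusted=True)
print(config.dataset_url)  # computed on first access, then cached

# An invalid combination is rejected by the model validator; pydantic surfaces
# the ValueError raised above as part of its validation error.
try:
    WorldPopConfig(country="KEN", year=2022, resolution="HIGH")
except ValueError as exc:
    print(exc)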

WorldPopDownloader

A class to handle downloads of WorldPop datasets.

Source code in gigaspatial/handlers/worldpop.py
class WorldPopDownloader:
    """A class to handle downloads of WorldPop datasets."""

    def __init__(
        self,
        config: Union[WorldPopConfig, dict[str, Union[str, int]]],
        data_store: Optional[DataStore] = None,
        logger: Optional[logging.Logger] = None,
    ):
        """
        Initialize the downloader.

        Args:
            config: Configuration for the WorldPop dataset, either as a WorldPopConfig object or a dictionary of parameters
            data_store: Optional data storage interface. If not provided, uses LocalDataStore.
            logger: Optional custom logger. If not provided, uses default logger.
        """
        self.logger = logger or global_config.get_logger(self.__class__.__name__)
        self.data_store = data_store or LocalDataStore()
        self.config = (
            config if isinstance(config, WorldPopConfig) else WorldPopConfig(**config)
        )

    @classmethod
    def from_country_year(cls, country: str, year: int, **kwargs):
        """
        Create a downloader instance from country and year.

        Args:
            country: Country code or name
            year: Year of the dataset
            **kwargs: Additional parameters for WorldPopConfig or the downloader
        """
        return cls({"country": country, "year": year}, **kwargs)

    def download_dataset(self) -> Optional[str]:
        """
        Download the configured dataset and return the path it was written to, or None if the download fails.
        """

        try:
            response = requests.get(self.config.dataset_url, stream=True)
            response.raise_for_status()

            output_path = str(self.config.dataset_path)

            total_size = int(response.headers.get("content-length", 0))

            with self.data_store.open(output_path, "wb") as file:
                with tqdm(
                    total=total_size,
                    unit="B",
                    unit_scale=True,
                    desc=f"Downloading {os.path.basename(output_path)}",
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            file.write(chunk)
                            pbar.update(len(chunk))

            self.logger.debug(f"Successfully downloaded dataset: {self.config}")

            return output_path

        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to download dataset {self.config}: {str(e)}")
            return None
        except Exception as e:
            self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
            return None
__init__(config, data_store=None, logger=None)

Initialize the downloader.

Parameters:

config (Union[WorldPopConfig, dict[str, Union[str, int]]], required)
    Configuration for the WorldPop dataset, either as a WorldPopConfig object or a dictionary of parameters
data_store (Optional[DataStore], default: None)
    Optional data storage interface. If not provided, uses LocalDataStore.
logger (Optional[Logger], default: None)
    Optional custom logger. If not provided, uses default logger.
Source code in gigaspatial/handlers/worldpop.py
def __init__(
    self,
    config: Union[WorldPopConfig, dict[str, Union[str, int]]],
    data_store: Optional[DataStore] = None,
    logger: Optional[logging.Logger] = None,
):
    """
    Initialize the downloader.

    Args:
        config: Configuration for the WorldPop dataset, either as a WorldPopConfig object or a dictionary of parameters
        data_store: Optional data storage interface. If not provided, uses LocalDataStore.
        logger: Optional custom logger. If not provided, uses default logger.
    """
    self.logger = logger or global_config.get_logger(self.__class__.__name__)
    self.data_store = data_store or LocalDataStore()
    self.config = (
        config if isinstance(config, WorldPopConfig) else WorldPopConfig(**config)
    )
download_dataset()

Download the configured dataset and return the path it was written to, or None if the download fails.

Source code in gigaspatial/handlers/worldpop.py
def download_dataset(self) -> Optional[str]:
    """
    Download the configured dataset and return the path it was written to, or None if the download fails.
    """

    try:
        response = requests.get(self.config.dataset_url, stream=True)
        response.raise_for_status()

        output_path = str(self.config.dataset_path)

        total_size = int(response.headers.get("content-length", 0))

        with self.data_store.open(output_path, "wb") as file:
            with tqdm(
                total=total_size,
                unit="B",
                unit_scale=True,
                desc=f"Downloading {os.path.basename(output_path)}",
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)
                        pbar.update(len(chunk))

        self.logger.debug(f"Successfully downloaded dataset: {self.config}")

        return output_path

    except requests.exceptions.RequestException as e:
        self.logger.error(f"Failed to download dataset {self.config}: {str(e)}")
        return None
    except Exception as e:
        self.logger.error(f"Unexpected error downloading dataset: {str(e)}")
        return None
from_country_year(country, year, **kwargs) classmethod

Create a downloader instance from country and year.

Parameters:

country (str, required)
    Country code or name
year (int, required)
    Year of the dataset
**kwargs (default: {})
    Additional parameters for WorldPopConfig or the downloader
Source code in gigaspatial/handlers/worldpop.py
@classmethod
def from_country_year(cls, country: str, year: int, **kwargs):
    """
    Create a downloader instance from country and year.

    Args:
        country: Country code or name
        year: Year of the dataset
        **kwargs: Additional parameters for WorldPopConfig or the downloader
    """
    return cls({"country": country, "year": year}, **kwargs)
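
A minimal end-to-end sketch of the downloader (not from the library source); storage and logging fall back to their defaults:

from gigaspatial.handlers.worldpop import WorldPopDownloader

downloader = WorldPopDownloader.from_country_year(country="KEN", year=2020)
output_path = downloader.download_dataset()  # path the file was written to, or None on failure
if output_path is not None:
    print(f"Dataset stored at {output_path}")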

read_dataset(data_store, path, compression=None, **kwargs)

Read data from various file formats stored in both local and cloud-based storage.

Parameters:

data_store : DataStore
    Instance of DataStore for accessing data storage.
path : str, Path
    Path to the file in data storage.
compression : str, optional
    When None (the default), compression is inferred from the file extension (e.g. ".csv.gz" is read as a gzipped CSV); passing a value skips this automatic detection.
**kwargs : dict
    Additional arguments passed to the specific reader function.

Returns:

pandas.DataFrame or geopandas.GeoDataFrame
    The data read from the file.

Raises:

FileNotFoundError
    If the file doesn't exist in blob storage.
ValueError
    If the file type is unsupported or if there's an error reading the file.

Source code in gigaspatial/core/io/readers.py
def read_dataset(data_store: DataStore, path: str, compression: str = None, **kwargs):
    """
    Read data from various file formats stored in both local and cloud-based storage.

    Parameters:
    ----------
    data_store : DataStore
        Instance of DataStore for accessing data storage.
    path : str, Path
        Path to the file in data storage.
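    compression : str, optional
        When None (the default), compression is inferred from the file extension
        (e.g. ".csv.gz" is read as a gzipped CSV); passing a value skips this
        automatic detection.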
    **kwargs : dict
        Additional arguments passed to the specific reader function.

    Returns:
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        The data read from the file.

    Raises:
    ------
    FileNotFoundError
        If the file doesn't exist in blob storage.
    ValueError
        If the file type is unsupported or if there's an error reading the file.
    """

    # Define supported file formats and their readers
    BINARY_FORMATS = {
        ".shp",
        ".zip",
        ".parquet",
        ".gpkg",
        ".xlsx",
        ".xls",
        ".kmz",
        ".gz",
    }

    PANDAS_READERS = {
        ".csv": pd.read_csv,
        ".xlsx": lambda f, **kw: pd.read_excel(f, engine="openpyxl", **kw),
        ".xls": lambda f, **kw: pd.read_excel(f, engine="xlrd", **kw),
        ".json": pd.read_json,
        # ".gz": lambda f, **kw: pd.read_csv(f, compression="gzip", **kw),
    }

    GEO_READERS = {
        ".shp": gpd.read_file,
        ".zip": gpd.read_file,
        ".geojson": gpd.read_file,
        ".gpkg": gpd.read_file,
        ".parquet": gpd.read_parquet,
        ".kmz": read_kmz,
    }

    COMPRESSION_FORMATS = {
        ".gz": "gzip",
        ".bz2": "bz2",
        ".zip": "zip",
        ".xz": "xz",
    }

    try:
        # Check if file exists
        if not data_store.file_exists(path):
            raise FileNotFoundError(f"File '{path}' not found in blob storage")

        path_obj = Path(path)
        suffixes = path_obj.suffixes
        file_extension = suffixes[-1].lower() if suffixes else ""

        if compression is None and file_extension in COMPRESSION_FORMATS:
            compression_format = COMPRESSION_FORMATS[file_extension]

            # if file has multiple extensions (e.g., .csv.gz), get the inner format
            if len(suffixes) > 1:
                inner_extension = suffixes[-2].lower()

                if inner_extension == ".tar":
                    raise ValueError(
                        "Tar archives (.tar.gz) are not directly supported"
                    )

                if inner_extension in PANDAS_READERS:
                    try:
                        with data_store.open(path, "rb") as f:
                            return PANDAS_READERS[inner_extension](
                                f, compression=compression_format, **kwargs
                            )
                    except Exception as e:
                        raise ValueError(f"Error reading compressed file: {str(e)}")
                elif inner_extension in GEO_READERS:
                    try:
                        with data_store.open(path, "rb") as f:
                            if compression_format == "gzip":
                                import gzip

                                decompressed_data = gzip.decompress(f.read())
                                import io

                                return GEO_READERS[inner_extension](
                                    io.BytesIO(decompressed_data), **kwargs
                                )
                            else:
                                raise ValueError(
                                    f"Compression format {compression_format} not supported for geo data"
                                )
                    except Exception as e:
                        raise ValueError(f"Error reading compressed geo file: {str(e)}")
            else:
                # if just .gz without clear inner type, assume csv
                try:
                    with data_store.open(path, "rb") as f:
                        return pd.read_csv(f, compression=compression_format, **kwargs)
                except Exception as e:
                    raise ValueError(
                        f"Error reading compressed file as CSV: {str(e)}. "
                        f"If not a CSV, specify the format in the filename (e.g., .json.gz)"
                    )

        # Special handling for compressed files
        if file_extension == ".zip":
            # For zip files, we need to use binary mode
            with data_store.open(path, "rb") as f:
                return gpd.read_file(f)

        # Determine if we need binary mode based on file type
        mode = "rb" if file_extension in BINARY_FORMATS else "r"

        # Try reading with appropriate reader
        if file_extension in PANDAS_READERS:
            try:
                with data_store.open(path, mode) as f:
                    return PANDAS_READERS[file_extension](f, **kwargs)
            except Exception as e:
                raise ValueError(f"Error reading file with pandas: {str(e)}")

        if file_extension in GEO_READERS:
            try:
                with data_store.open(path, "rb") as f:
                    return GEO_READERS[file_extension](f, **kwargs)
            except Exception as e:
                # For parquet files, try pandas reader if geopandas fails
                if file_extension == ".parquet":
                    try:
                        with data_store.open(path, "rb") as f:
                            return pd.read_parquet(f, **kwargs)
                    except Exception as e2:
                        raise ValueError(
                            f"Failed to read parquet with both geopandas ({str(e)}) "
                            f"and pandas ({str(e2)})"
                        )
                raise ValueError(f"Error reading file with geopandas: {str(e)}")

        # If we get here, the file type is unsupported
        supported_formats = sorted(set(PANDAS_READERS.keys()) | set(GEO_READERS.keys()))
        supported_compressions = sorted(COMPRESSION_FORMATS.keys())
        raise ValueError(
            f"Unsupported file type: {file_extension}\n"
            f"Supported formats: {', '.join(supported_formats)}\n"
            f"Supported compressions: {', '.join(supported_compressions)}"
        )

    except Exception as e:
        if isinstance(e, (FileNotFoundError, ValueError)):
            raise
        raise RuntimeError(f"Unexpected error reading dataset: {str(e)}")

read_datasets(data_store, paths, **kwargs)

Read multiple datasets from data storage at once.

Parameters:

data_store : DataStore
    Instance of DataStore for accessing data storage.
paths : list of str
    Paths to files in data storage.
**kwargs : dict
    Additional arguments passed to read_dataset.

Returns:

dict
    Dictionary mapping paths to their corresponding DataFrames/GeoDataFrames.

Source code in gigaspatial/core/io/readers.py
def read_datasets(data_store: DataStore, paths, **kwargs):
    """
    Read multiple datasets from data storage at once.

    Parameters:
    ----------
    data_store : DataStore
        Instance of DataStore for accessing data storage.
    paths : list of str
        Paths to files in data storage.
    **kwargs : dict
        Additional arguments passed to read_dataset.

    Returns:
    -------
    dict
        Dictionary mapping paths to their corresponding DataFrames/GeoDataFrames.
    """
    results = {}
    errors = {}

    for path in paths:
        try:
            results[path] = read_dataset(data_store, path, **kwargs)
        except Exception as e:
            errors[path] = str(e)

    if errors:
        error_msg = "\n".join(f"- {path}: {error}" for path, error in errors.items())
        raise ValueError(f"Errors reading datasets:\n{error_msg}")

    return results
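
Because read_datasets collects per-path errors and raises a single ValueError, related files can be loaded in one call and failures handled in one place. A hedged sketch with hypothetical paths and an assumed LocalDataStore import:

from gigaspatial.core.io.data_store import LocalDataStore  # assumed import path
from gigaspatial.core.io.readers import read_datasets

data_store = LocalDataStore()
paths = ["tables/schools.csv", "boundaries/admin1.geojson"]

try:
    frames = read_datasets(data_store, paths)  # dict: path -> DataFrame / GeoDataFrame
except ValueError as exc:
    print(exc)  # one aggregated message listing each path that failed to read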

read_gzipped_json_or_csv(file_path, data_store)

Reads a gzipped file, attempting to parse it as JSON (lines=True) or CSV.

Source code in gigaspatial/core/io/readers.py
def read_gzipped_json_or_csv(file_path, data_store):
    """Reads a gzipped file, attempting to parse it as JSON (lines=True) or CSV."""

    with data_store.open(file_path, "rb") as f:
        g = gzip.GzipFile(fileobj=f)
        text = g.read().decode("utf-8")
        try:
            df = pd.read_json(io.StringIO(text), lines=True)
            return df
        except json.JSONDecodeError:
            try:
                df = pd.read_csv(io.StringIO(text))
                return df
            except pd.errors.ParserError:
                print(f"Error: Could not parse {file_path} as JSON or CSV.")
                return None

read_kmz(file_obj, **kwargs)

Helper function to read KMZ files and return a GeoDataFrame.

Source code in gigaspatial/core/io/readers.py
def read_kmz(file_obj, **kwargs):
    """Helper function to read KMZ files and return a GeoDataFrame."""
    try:
        with zipfile.ZipFile(file_obj) as kmz:
            # Find the KML file in the archive (usually doc.kml)
            kml_filename = next(
                name for name in kmz.namelist() if name.endswith(".kml")
            )

            # Read the KML content
            kml_content = io.BytesIO(kmz.read(kml_filename))

            gdf = gpd.read_file(kml_content)

            # Validate the GeoDataFrame
            if gdf.empty:
                raise ValueError(
                    "The KML file is empty or does not contain valid geospatial data."
                )

        return gdf

    except zipfile.BadZipFile:
        raise ValueError("The provided file is not a valid KMZ file.")
    except StopIteration:
        raise ValueError("No KML file found in the KMZ archive.")
    except Exception as e:
        raise RuntimeError(f"An error occurred: {e}")

write_dataset(data, data_store, path, **kwargs)

Write DataFrame or GeoDataFrame to various file formats in local or cloud-based data storage.

Parameters:

data : pandas.DataFrame or geopandas.GeoDataFrame
    The data to write to blob storage.
data_store : DataStore
    Instance of DataStore for accessing data storage.
path : str
    Path where the file will be written in data storage.
**kwargs : dict
    Additional arguments passed to the specific writer function.

Raises:

ValueError
    If the file type is unsupported or if there's an error writing the file.
TypeError
    If input data is not a DataFrame or GeoDataFrame.

Source code in gigaspatial/core/io/writers.py
def write_dataset(data, data_store: DataStore, path, **kwargs):
    """
    Write DataFrame or GeoDataFrame to various file formats in local or cloud-based data storage.

    Parameters:
    ----------
    data : pandas.DataFrame or geopandas.GeoDataFrame
        The data to write to blob storage.
    data_store : DataStore
        Instance of DataStore for accessing data storage.
    path : str
        Path where the file will be written in data storage.
    **kwargs : dict
        Additional arguments passed to the specific writer function.

    Raises:
    ------
    ValueError
        If the file type is unsupported or if there's an error writing the file.
    TypeError
        If input data is not a DataFrame or GeoDataFrame.
    """

    # Define supported file formats and their writers
    BINARY_FORMATS = {".shp", ".zip", ".parquet", ".gpkg", ".xlsx", ".xls"}

    PANDAS_WRITERS = {
        ".csv": lambda df, buf, **kw: df.to_csv(buf, **kw),
        ".xlsx": lambda df, buf, **kw: df.to_excel(buf, engine="openpyxl", **kw),
        ".json": lambda df, buf, **kw: df.to_json(buf, **kw),
        ".parquet": lambda df, buf, **kw: df.to_parquet(buf, **kw),
    }

    GEO_WRITERS = {
        ".geojson": lambda gdf, buf, **kw: gdf.to_file(buf, driver="GeoJSON", **kw),
        ".gpkg": lambda gdf, buf, **kw: gdf.to_file(buf, driver="GPKG", **kw),
        ".parquet": lambda gdf, buf, **kw: gdf.to_parquet(buf, **kw),
    }

    try:
        # Input validation
        if not isinstance(data, (pd.DataFrame, gpd.GeoDataFrame)):
            raise TypeError("Input data must be a pandas DataFrame or GeoDataFrame")

        # Get file suffix and ensure it's lowercase
        suffix = Path(path).suffix.lower()

        # Determine if we need binary mode based on file type
        mode = "wb" if suffix in BINARY_FORMATS else "w"

        # Handle different data types and formats
        if isinstance(data, gpd.GeoDataFrame):
            if suffix not in GEO_WRITERS:
                supported_formats = sorted(GEO_WRITERS.keys())
                raise ValueError(
                    f"Unsupported file type for GeoDataFrame: {suffix}\n"
                    f"Supported formats: {', '.join(supported_formats)}"
                )

            try:
                with data_store.open(path, "wb") as f:
                    GEO_WRITERS[suffix](data, f, **kwargs)
            except Exception as e:
                raise ValueError(f"Error writing GeoDataFrame: {str(e)}")

        else:  # pandas DataFrame
            if suffix not in PANDAS_WRITERS:
                supported_formats = sorted(PANDAS_WRITERS.keys())
                raise ValueError(
                    f"Unsupported file type for DataFrame: {suffix}\n"
                    f"Supported formats: {', '.join(supported_formats)}"
                )

            try:
                with data_store.open(path, mode) as f:
                    PANDAS_WRITERS[suffix](data, f, **kwargs)
            except Exception as e:
                raise ValueError(f"Error writing DataFrame: {str(e)}")

    except Exception as e:
        if isinstance(e, (TypeError, ValueError)):
            raise
        raise RuntimeError(f"Unexpected error writing dataset: {str(e)}")

write_datasets(data_dict, data_store, **kwargs)

Write multiple datasets to data storage at once.

Parameters:

data_dict : dict
    Dictionary mapping paths to DataFrames/GeoDataFrames.
data_store : DataStore
    Instance of DataStore for accessing data storage.
**kwargs : dict
    Additional arguments passed to write_dataset.

Raises:

ValueError
    If there are any errors writing the datasets.

Source code in gigaspatial/core/io/writers.py
def write_datasets(data_dict, data_store: DataStore, **kwargs):
    """
    Write multiple datasets to data storage at once.

    Parameters:
    ----------
    data_dict : dict
        Dictionary mapping paths to DataFrames/GeoDataFrames.
    data_store : DataStore
        Instance of DataStore for accessing data storage.
    **kwargs : dict
        Additional arguments passed to write_dataset.

    Raises:
    ------
    ValueError
        If there are any errors writing the datasets.
    """
    errors = {}

    for path, data in data_dict.items():
        try:
            write_dataset(data, data_store, path, **kwargs)
        except Exception as e:
            errors[path] = str(e)

    if errors:
        error_msg = "\n".join(f"- {path}: {error}" for path, error in errors.items())
        raise ValueError(f"Errors writing datasets:\n{error_msg}")