Skip to content

Generators Module

gigaspatial.generators

poi

base

PoiViewGenerator

Bases: ABC

Base class for generating views from Points of Interest (POI) datasets.

This class provides the structure for processing downloaded data sources and mapping them to POI data. Concrete implementations should extend this class for specific data sources.

Source code in gigaspatial/generators/poi/base.py
class PoiViewGenerator(ABC):
    """
    Base class for generating views from Points of Interest (POI) datasets.

    This class provides the structure for processing downloaded data sources
    and mapping them to POI data. Concrete implementations should extend this
    class for specific data sources.

    The default pipeline run by ``generate_poi_view`` is
    ``resolve_source_paths`` -> ``load_data`` -> ``process_data`` ->
    ``map_to_poi``. ``load_data`` and ``map_to_poi`` are abstract and must be
    implemented by subclasses.
    """

    def __init__(
        self,
        data_config: Optional[Any] = None,
        generator_config: Optional["PoiViewGeneratorConfig"] = None,
        data_store: Optional["DataStore"] = None,
    ):
        """
        Initialize the POI View Generator.

        Args:
            data_config: Optional source-specific data configuration; stored
                unchanged on the instance
            generator_config: Configuration for the view generator; a default
                PoiViewGeneratorConfig is created when not provided
            data_store: Data store for reading/writing data; a LocalDataStore
                is created when not provided
        """
        self.data_config = data_config
        self.generator_config = generator_config or PoiViewGeneratorConfig()
        self.data_store = data_store or LocalDataStore()
        # Logger is named after the concrete subclass, not this base class.
        self.logger = config.get_logger(self.__class__.__name__)

    def resolve_source_paths(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
        **kwargs,
    ) -> List[Union[str, Path]]:
        """
        Resolve source data paths based on POI data or explicit paths.

        This method allows generators to dynamically determine source paths
        based on the POI data (e.g., by geographic intersection).

        Args:
            poi_data: POI data that may be used to determine relevant source paths
            explicit_paths: Explicitly provided source paths, if any
            **kwargs: Additional parameters for path resolution

        Returns:
            List of resolved source paths

        Raises:
            NotImplementedError: If no explicit paths are given and the
                subclass has not overridden this method

        Notes:
            Default implementation returns explicit_paths if provided.
            Subclasses should override this to implement dynamic path resolution.
        """
        if explicit_paths is not None:
            # A single path is wrapped so callers always receive a list.
            if isinstance(explicit_paths, (str, Path)):
                return [explicit_paths]
            return list(explicit_paths)

        # Raises NotImplementedError if no explicit paths
        # and subclass hasn't overridden this method
        raise NotImplementedError(
            "This generator requires explicit source paths or a subclass "
            "implementation of resolve_source_paths()"
        )

    def _pre_load_hook(self, source_data_path, **kwargs) -> Any:
        """Hook called before loading data; the default returns the paths unchanged."""
        return source_data_path

    def _post_load_hook(self, data, **kwargs) -> Any:
        """Hook called after loading data; the default returns the data unchanged."""
        return data

    @abstractmethod
    def load_data(self, source_data_path: List[Union[str, Path]], **kwargs) -> Any:
        """
        Load source data for POI processing. This method handles diverse source data formats.

        Args:
            source_data_path: List of source paths
            **kwargs: Additional parameters for data loading

        Returns:
            Data in its source format (DataFrame, GeoDataFrame, TifProcessor, etc.)
        """
        pass

    def process_data(self, data: Any, **kwargs) -> Any:
        """
        Process the source data to prepare it for POI view generation.

        The default implementation is a pass-through; subclasses may override
        it when the loaded data needs transformation before mapping.

        Args:
            data: Source data as returned by ``load_data``
            **kwargs: Additional processing parameters (unused by the default)

        Returns:
            The input data, unchanged
        """
        return data

    @abstractmethod
    def map_to_poi(
        self,
        processed_data: Any,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        **kwargs,
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Map processed data to POI data.

        Args:
            processed_data: Processed source data, as returned by ``process_data``
            poi_data: POI data to map to
            **kwargs: Additional mapping parameters

        Returns:
            (Geo)DataFrame with POI data mapped to source data
        """
        pass

    def generate_poi_view(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        source_data_path: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
        custom_pipeline: bool = False,
        **kwargs,
    ) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
        """
        Generate a POI view by running the complete pipeline.

        This method has been updated to make source_data_path optional.
        If not provided, it will use resolve_source_paths() to determine paths.

        Args:
            poi_data: POI data to map to
            source_data_path: Optional explicit path(s) to the source data
            custom_pipeline: When True, skip the default pipeline and return
                ``self._custom_pipeline(source_data_path, poi_data, **kwargs)``
            **kwargs: Additional parameters for the pipeline; the same kwargs
                are forwarded to every pipeline step

        Returns:
            DataFrame with the generated POI view, or ``poi_data`` unchanged
            when no source paths could be resolved
        """
        if custom_pipeline:
            # NOTE(review): _custom_pipeline is not defined in this class;
            # a subclass must provide it before custom_pipeline=True is used.
            return self._custom_pipeline(source_data_path, poi_data, **kwargs)

        self.logger.info("Starting POI view generation pipeline")

        # Resolve source paths if not explicitly provided
        resolved_paths = self.resolve_source_paths(poi_data, source_data_path, **kwargs)

        if not resolved_paths:
            self.logger.warning(
                "No source data paths resolved. Returning original POI data."
            )
            return poi_data

        # load data from resolved sources
        source_data = self.load_data(resolved_paths, **kwargs)
        self.logger.info("Source data loaded successfully")

        # process the data
        processed_data = self.process_data(source_data, **kwargs)
        self.logger.info("Data processing completed")

        # map to POI
        poi_view = self.map_to_poi(processed_data, poi_data, **kwargs)
        self.logger.info("POI mapping completed")

        return poi_view

    def save_poi_view(
        self,
        poi_view: Union[pd.DataFrame, gpd.GeoDataFrame],
        output_path: Union[Path, str],
        **kwargs,
    ) -> None:
        """
        Save the generated POI view to the data store.

        Args:
            poi_view: The POI view DataFrame to save
            output_path: Path where the POI view will be saved in DataStore
            **kwargs: Additional parameters forwarded to ``write_dataset``
        """
        self.logger.info(f"Saving POI view to {output_path}")
        write_dataset(poi_view, self.data_store, output_path, **kwargs)
__init__(data_config=None, generator_config=None, data_store=None)

Initialize the POI View Generator.

Parameters:

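Name Type Description Default
data_config Optional[Any]

Source-specific data configuration, stored unchanged on the instance

None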
generator_config Optional[PoiViewGeneratorConfig]

Configuration for the view generator

None
data_store Optional[DataStore]

Data store for reading/writing data

None
Source code in gigaspatial/generators/poi/base.py
def __init__(
    self,
    data_config: Optional[Any] = None,
    generator_config: Optional["PoiViewGeneratorConfig"] = None,
    data_store: Optional["DataStore"] = None,
):
    """
    Set up the generator's configuration, storage backend and logger.

    Args:
        data_config: Source-specific data configuration, stored unchanged
        generator_config: Configuration for the view generator; a fresh
            PoiViewGeneratorConfig is built when this is falsy
        data_store: Data store for reading/writing data; a LocalDataStore
            is built when this is falsy
    """
    self.data_config = data_config

    # Fall back to default collaborators when none were supplied.
    if not generator_config:
        generator_config = PoiViewGeneratorConfig()
    self.generator_config = generator_config

    if not data_store:
        data_store = LocalDataStore()
    self.data_store = data_store

    # The logger carries the concrete class name.
    logger_name = self.__class__.__name__
    self.logger = config.get_logger(logger_name)
generate_poi_view(poi_data, source_data_path=None, custom_pipeline=False, **kwargs)

Generate a POI view by running the complete pipeline.

This method has been updated to make source_data_path optional. If not provided, it will use resolve_source_paths() to determine paths.

Parameters:

Name Type Description Default
poi_data Union[DataFrame, GeoDataFrame]

POI data to map to

required
source_data_path Optional[Union[Path, str, List[Union[str, Path]]]]

Optional explicit path(s) to the source data

None
**kwargs

Additional parameters for the pipeline

{}

Returns:

Type Description
Union[DataFrame, GeoDataFrame]

DataFrame with the generated POI view

Source code in gigaspatial/generators/poi/base.py
def generate_poi_view(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    source_data_path: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
    custom_pipeline: bool = False,
    **kwargs,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Run the full POI view pipeline: resolve paths, load, process, map.

    ``source_data_path`` is optional; when it is not given, the paths are
    determined by ``resolve_source_paths()``.

    Args:
        poi_data: POI data to map to
        source_data_path: Optional explicit path(s) to the source data
        custom_pipeline: When True, the default pipeline is bypassed and the
            result of ``self._custom_pipeline(...)`` is returned instead
        **kwargs: Additional parameters, forwarded to every pipeline step

    Returns:
        DataFrame with the generated POI view, or ``poi_data`` itself when
        no source paths were resolved
    """
    if custom_pipeline:
        return self._custom_pipeline(source_data_path, poi_data, **kwargs)

    log = self.logger
    log.info("Starting POI view generation pipeline")

    # Explicit paths win; otherwise the generator decides which sources apply.
    paths = self.resolve_source_paths(poi_data, source_data_path, **kwargs)

    if paths:
        # Step 1: load
        raw = self.load_data(paths, **kwargs)
        log.info("Source data loaded successfully")

        # Step 2: process
        prepared = self.process_data(raw, **kwargs)
        log.info("Data processing completed")

        # Step 3: map onto the POIs
        view = self.map_to_poi(prepared, poi_data, **kwargs)
        log.info("POI mapping completed")
        return view

    # Nothing to load: hand the POI data back untouched.
    log.warning("No source data paths resolved. Returning original POI data.")
    return poi_data
load_data(source_data_path, **kwargs) abstractmethod

Load source data for POI processing. This method handles diverse source data formats.

Parameters:

Name Type Description Default
source_data_path List[Union[str, Path]]

List of source paths

required
**kwargs

Additional parameters for data loading

{}

Returns:

Type Description
Any

Data in its source format (DataFrame, GeoDataFrame, TifProcessor, etc.)

Source code in gigaspatial/generators/poi/base.py
@abstractmethod
def load_data(self, source_data_path: List[Union[str, Path]], **kwargs) -> Any:
    """
    Read the source data that will later be mapped onto the POIs.

    Abstract: each concrete generator supplies its own loader, since source
    formats differ between data sources.

    Args:
        source_data_path: List of source paths
        **kwargs: Additional parameters for data loading

    Returns:
        Data in its source format (DataFrame, GeoDataFrame, TifProcessor, etc.)
    """
    ...
map_to_poi(processed_data, poi_data, **kwargs) abstractmethod

Map processed data to POI data.

Parameters:

Name Type Description Default
processed_data Any

Processed source data as a GeoDataFrame

required
poi_data Union[DataFrame, GeoDataFrame]

POI data to map to

required
**kwargs

Additional mapping parameters

{}

Returns:

Type Description
Union[DataFrame, GeoDataFrame]

(Geo)DataFrame with POI data mapped to source data

Source code in gigaspatial/generators/poi/base.py
@abstractmethod
def map_to_poi(
    self,
    processed_data: Any,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    **kwargs,
) -> Union[pd.DataFrame, gpd.GeoDataFrame]:
    """
    Attach information from the processed source data to the POIs.

    Abstract: concrete generators decide how source data is sampled or
    joined for each POI.

    Args:
        processed_data: Processed source data
        poi_data: POI data to map to
        **kwargs: Additional mapping parameters

    Returns:
        (Geo)DataFrame with POI data mapped to source data
    """
    ...
process_data(data, **kwargs)

Process the source data to prepare it for POI view generation.

Source code in gigaspatial/generators/poi/base.py
def process_data(self, data: Any, **kwargs) -> Any:
    """
    Process the source data to prepare it for POI view generation.

    The default implementation is a pass-through; subclasses may override it
    when the loaded data needs transformation before mapping.

    Args:
        data: Source data as returned by ``load_data``
        **kwargs: Additional processing parameters (unused by the default)

    Returns:
        The input data, unchanged
    """
    return data
resolve_source_paths(poi_data, explicit_paths=None, **kwargs)

Resolve source data paths based on POI data or explicit paths.

This method allows generators to dynamically determine source paths based on the POI data (e.g., by geographic intersection).

Parameters:

Name Type Description Default
poi_data Union[DataFrame, GeoDataFrame]

POI data that may be used to determine relevant source paths

required
explicit_paths Optional[Union[Path, str, List[Union[str, Path]]]]

Explicitly provided source paths, if any

None
**kwargs

Additional parameters for path resolution

{}

Returns:

Type Description
List[Union[str, Path]]

List of resolved source paths

Notes

Default implementation returns explicit_paths if provided. Subclasses should override this to implement dynamic path resolution.

Source code in gigaspatial/generators/poi/base.py
def resolve_source_paths(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
    **kwargs,
) -> List[Union[str, Path]]:
    """
    Work out which source paths to load for the given POI data.

    The base implementation only normalises explicitly supplied paths into a
    list. Generators that can derive paths from the POI data itself (e.g.,
    by geographic intersection) should override this method.

    Args:
        poi_data: POI data that may be used to determine relevant source paths
        explicit_paths: Explicitly provided source paths, if any
        **kwargs: Additional parameters for path resolution

    Returns:
        List of resolved source paths

    Raises:
        NotImplementedError: If ``explicit_paths`` is None and the method
            has not been overridden
    """
    # Guard first: without explicit paths the base class cannot resolve anything.
    if explicit_paths is None:
        raise NotImplementedError(
            "This generator requires explicit source paths or a subclass "
            "implementation of resolve_source_paths()"
        )

    # A lone path becomes a one-element list; any other iterable is copied.
    is_single_path = isinstance(explicit_paths, (str, Path))
    return [explicit_paths] if is_single_path else list(explicit_paths)
save_poi_view(poi_view, output_path, **kwargs)

Save the generated POI view to the data store.

Parameters:

Name Type Description Default
poi_view Union[DataFrame, GeoDataFrame]

The POI view DataFrame to save

required
output_path Union[Path, str]

Path where the POI view will be saved in DataStore

required
**kwargs

Additional parameters for saving

{}
Source code in gigaspatial/generators/poi/base.py
def save_poi_view(
    self,
    poi_view: Union[pd.DataFrame, gpd.GeoDataFrame],
    output_path: Union[Path, str],
    **kwargs,
) -> None:
    """
    Persist a generated POI view through the generator's data store.

    Args:
        poi_view: The POI view DataFrame to save
        output_path: Path where the POI view will be saved in DataStore
        **kwargs: Additional parameters, forwarded to ``write_dataset``
    """
    message = f"Saving POI view to {output_path}"
    self.logger.info(message)

    # Writing is delegated to write_dataset, targeting this generator's store.
    store = self.data_store
    write_dataset(poi_view, store, output_path, **kwargs)
PoiViewGeneratorConfig

Configuration for POI view generation.

Source code in gigaspatial/generators/poi/base.py
@dataclass
class PoiViewGeneratorConfig:
    """
    Configuration for POI view generation.

    Both fields have defaults, so the config can be created with no arguments.
    """

    # Base location for POI views. The default comes from
    # config.get_path("poi", "views"), which is called once when this class
    # body is executed rather than per instance.
    # NOTE(review): `Field` looks like pydantic's Field, which would imply a
    # pydantic-style dataclass decorator - confirm against the file's imports.
    base_path: Path = Field(default=config.get_path("poi", "views"))
    # Worker count, default 4. Not read by any code visible in this section.
    n_workers: int = 4

ghsl_built_s

GhslBuiltSurfacePoiViewGenerator

Bases: PoiViewGenerator

Generate POI views from GHSL Built Surface.

Source code in gigaspatial/generators/poi/ghsl_built_s.py
class GhslBuiltSurfacePoiViewGenerator(PoiViewGenerator):
    """Generate POI views from GHSL Built Surface."""

    def __init__(
        self,
        data_config: Optional[GHSLDataConfig] = None,
        generator_config: Optional[PoiViewGeneratorConfig] = None,
        data_store: Optional[DataStore] = None,
    ):
        """
        Initialize the GHSL Built Surface generator.

        Args:
            data_config: GHSL data configuration; when not provided, a config
                for product "GHS_BUILT_S", year 2020, resolution 100 and
                coord_system 4326 is created
            generator_config: Configuration for the view generator
            data_store: Data store for reading/writing data
        """
        super().__init__(generator_config=generator_config, data_store=data_store)
        self.data_config = data_config or GHSLDataConfig(
            product="GHS_BUILT_S", year=2020, resolution=100, coord_system=4326
        )

    def _pre_load_hook(self, source_data_path, **kwargs) -> Any:
        """
        Pre-processing before loading data files.

        Normalises the input to a list of ``.tif`` path strings and checks
        that each one exists in the data store.

        Args:
            source_data_path: A single path or a list of paths (str or Path)
            **kwargs: Unused

        Returns:
            List of path strings with the suffix replaced by ``.tif``

        Raises:
            RuntimeError: If any resulting path is missing from the data store
        """

        # Convert single path to list for consistent handling
        if isinstance(source_data_path, (Path, str)):
            source_data_path = [source_data_path]

        # change source suffix, .zip, to .tif and paths to string.
        # Each entry is wrapped in Path() so plain-string paths work as well;
        # str has no with_suffix().
        source_data_path = [
            str(Path(file_path).with_suffix(".tif")) for file_path in source_data_path
        ]

        # Validate all paths exist
        for file_path in source_data_path:
            if not self.data_store.file_exists(file_path):
                raise RuntimeError(
                    f"Source raster does not exist in the data store: {file_path}"
                )

        self.logger.info(
            f"Pre-loading validation complete for {len(source_data_path)} files"
        )
        return source_data_path

    def _post_load_hook(self, data, **kwargs) -> Any:
        """
        Post-processing after loading data files.

        Only logs the outcome; the data is returned unchanged.
        """
        if not data:
            self.logger.warning("No data was loaded from the source files")
            return data

        self.logger.info(
            f"Post-load processing complete. {len(data)} valid TifProcessors."
        )
        return data

    def resolve_source_paths(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
        **kwargs,
    ) -> List[Union[str, Path]]:
        """
        For GHSL Built Surface rasters, resolve source data paths based on POI data geography.

        Args:
            poi_data: POI data whose geometry selects the intersecting tiles
            explicit_paths: Explicit path(s); when given they are returned as
                a list and no tile lookup is performed
            **kwargs: Unused

        Returns:
            List of paths to relevant GHSL BUILT S tile rasters (empty when no
            tile intersects the POI data)
        """
        if explicit_paths is not None:
            if isinstance(explicit_paths, (str, Path)):
                return [explicit_paths]
            return list(explicit_paths)

        # Convert to GeoDataFrame if needed. A GeoDataFrame is itself a
        # pandas DataFrame, so the check must be against GeoDataFrame to
        # avoid re-converting data that is already geographic.
        if not isinstance(poi_data, gpd.GeoDataFrame):
            poi_data = convert_to_geodataframe(poi_data)

        # Find intersecting tiles
        intersection_tiles = self.data_config.get_intersecting_tiles(
            geometry=poi_data, crs=poi_data.crs
        )

        if not intersection_tiles:
            self.logger.warning("There are no matching GHSL tiles for the POI data")
            return []

        # Generate paths for each intersecting tile
        source_data_paths = [
            self.data_config.get_tile_path(tile_id=tile) for tile in intersection_tiles
        ]

        self.logger.info(f"Resolved {len(source_data_paths)} tile paths for POI data")
        return source_data_paths

    def load_data(
        self, source_data_path: Union[Path, List[Path]], **kwargs
    ) -> List[TifProcessor]:
        """
        Load GHSL Built Surface rasters into TifProcessors from paths.

        Args:
            source_data_path: Path(s) to the source data
            **kwargs: Additional loading parameters, forwarded to both the
                pre-load and post-load hooks

        Returns:
            List of TifProcessors with built surface data
        """

        processed_paths = self._pre_load_hook(source_data_path, **kwargs)

        tif_processors = [
            TifProcessor(data_path, self.data_store, mode="single")
            for data_path in processed_paths
        ]

        # Forward kwargs so the post hook sees the same options as the pre hook.
        return self._post_load_hook(tif_processors, **kwargs)

    def map_to_poi(
        self,
        processed_data: List[TifProcessor],
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        map_radius_meters: float,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Map from TifProcessors to POI data.

        Each POI is buffered by ``map_radius_meters`` and the rasters are
        sampled over the resulting polygons with the "sum" statistic. The
        result is written to a ``built_surface_m2`` column on ``poi_data``
        itself, so the input frame is modified in place.

        Args:
            processed_data: TifProcessors
            poi_data: POI data to map to
            map_radius_meters: Buffer distance, in meters, around each POI
            **kwargs: Additional mapping parameters (unused)

        Returns:
            DataFrame with POI data and built surface information
        """

        # Convert to GeoDataFrame if needed
        if not isinstance(poi_data, gpd.GeoDataFrame):
            gdf_points = convert_to_geodataframe(poi_data)
        else:
            gdf_points = poi_data

        # Sampling happens in the CRS of the configured GHSL product.
        gdf_points = gdf_points.to_crs(self.data_config.crs)

        polygon_list = buffer_geodataframe(
            gdf_points, buffer_distance_meters=map_radius_meters, cap_style="round"
        ).geometry

        sampled_values = sample_multiple_tifs_by_polygons(
            tif_processors=processed_data, polygon_list=polygon_list, stat="sum"
        )

        poi_data["built_surface_m2"] = sampled_values

        return poi_data

    def generate_view(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        source_data_path: Optional[Union[Path, List[Path]]] = None,
        map_radius_meters: float = 150,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Generate POI view from GHSL Built Surface.

        Args:
            poi_data: POI data to map to
            source_data_path: Optional explicit path(s) to the source rasters
            map_radius_meters: Buffer distance, in meters, around each POI
            **kwargs: Additional parameters forwarded to ``generate_poi_view``

        Returns:
            Enhanced POI data with Built Surface information
        """
        self.logger.info("Generating GHSL Built Surface POI view")

        return self.generate_poi_view(
            poi_data=poi_data,
            source_data_path=source_data_path,
            map_radius_meters=map_radius_meters,
            **kwargs,
        )
generate_view(poi_data, source_data_path=None, map_radius_meters=150, **kwargs)

Generate POI view from GHSL Built Surface.

Returns:

Type Description
DataFrame

Enhanced POI data with Built Surface information

Source code in gigaspatial/generators/poi/ghsl_built_s.py
def generate_view(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    source_data_path: Optional[Union[Path, List[Path]]] = None,
    map_radius_meters: float = 150,
    **kwargs,
) -> pd.DataFrame:
    """
    Build the GHSL Built Surface view for the given POIs.

    Thin convenience wrapper: logs, then delegates to ``generate_poi_view``
    with the buffer radius passed along as a keyword argument.

    Returns:
        Enhanced POI data with Built Surface information
    """
    self.logger.info("Generating GHSL Built Surface POI view")

    # Gather the named arguments once, then merge in any extras.
    pipeline_args = {
        "poi_data": poi_data,
        "source_data_path": source_data_path,
        "map_radius_meters": map_radius_meters,
    }
    return self.generate_poi_view(**pipeline_args, **kwargs)
load_data(source_data_path, **kwargs)

Load GHSL Built Surface rasters into TifProcessors from paths.

Parameters:

Name Type Description Default
source_data_path Union[Path, List[Path]]

Path(s) to the source data

required
**kwargs

Additional loading parameters

{}

Returns:

Type Description
List[TifProcessor]

List of TifProcessors with built surface data

Source code in gigaspatial/generators/poi/ghsl_built_s.py
def load_data(
    self, source_data_path: Union[Path, List[Path]], **kwargs
) -> List[TifProcessor]:
    """
    Load GHSL Built Surface rasters into TifProcessors from paths.

    Args:
        source_data_path: Path(s) to the source data
        **kwargs: Additional loading parameters, forwarded to both the
            pre-load and post-load hooks

    Returns:
        List of TifProcessors with built surface data
    """

    # The pre-load hook returns the paths that are actually opened.
    processed_paths = self._pre_load_hook(source_data_path, **kwargs)

    tif_processors = [
        TifProcessor(data_path, self.data_store, mode="single")
        for data_path in processed_paths
    ]

    # Forward kwargs so the post hook sees the same options as the pre hook.
    return self._post_load_hook(tif_processors, **kwargs)
map_to_poi(processed_data, poi_data, map_radius_meters, **kwargs)

Map from TifProcessors to POI data.

Parameters:

Name Type Description Default
processed_data List[TifProcessor]

TifProcessors

required
poi_data Union[DataFrame, GeoDataFrame]

POI data to map to

required
**kwargs

Additional mapping parameters

{}

Returns:

Type Description
DataFrame

DataFrame with POI data and built surface information

Source code in gigaspatial/generators/poi/ghsl_built_s.py
def map_to_poi(
    self,
    processed_data: List[TifProcessor],
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    map_radius_meters: float,
    **kwargs,
) -> pd.DataFrame:
    """
    Sample built-surface rasters around each POI and attach the result.

    Every POI is buffered by ``map_radius_meters`` and the rasters are
    sampled over those polygons with the "sum" statistic. The values are
    stored in a ``built_surface_m2`` column on ``poi_data`` itself, which is
    then returned.

    Args:
        processed_data: TifProcessors
        poi_data: POI data to map to
        map_radius_meters: Buffer distance, in meters, around each POI
        **kwargs: Additional mapping parameters

    Returns:
        DataFrame with POI data and built surface information
    """
    # Work on a GeoDataFrame; plain frames are converted first.
    points = (
        poi_data
        if isinstance(poi_data, gpd.GeoDataFrame)
        else convert_to_geodataframe(poi_data)
    )

    # Reproject to the CRS of the configured GHSL product.
    points = points.to_crs(self.data_config.crs)

    buffered = buffer_geodataframe(
        points, buffer_distance_meters=map_radius_meters, cap_style="round"
    )

    poi_data["built_surface_m2"] = sample_multiple_tifs_by_polygons(
        tif_processors=processed_data, polygon_list=buffered.geometry, stat="sum"
    )
    return poi_data
resolve_source_paths(poi_data, explicit_paths=None, **kwargs)

For GHSL Built Surface rasters, resolve source data paths based on POI data geography.

Returns:

Type Description
List[Union[str, Path]]

List of paths to relevant GHSL BUILT S tile rasters

Source code in gigaspatial/generators/poi/ghsl_built_s.py
def resolve_source_paths(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
    **kwargs,
) -> List[Union[str, Path]]:
    """
    For GHSL Built Surface rasters, resolve source data paths based on POI data geography.

    Args:
        poi_data: POI data whose geometry selects the intersecting tiles
        explicit_paths: Explicit path(s); when given they are returned as a
            list and no tile lookup is performed
        **kwargs: Unused

    Returns:
        List of paths to relevant GHSL BUILT S tile rasters (empty when no
        tile intersects the POI data)
    """
    if explicit_paths is not None:
        if isinstance(explicit_paths, (str, Path)):
            return [explicit_paths]
        return list(explicit_paths)

    # Convert to GeoDataFrame if needed. A GeoDataFrame is itself a pandas
    # DataFrame, so the check must be against GeoDataFrame to avoid
    # re-converting data that is already geographic.
    if not isinstance(poi_data, gpd.GeoDataFrame):
        poi_data = convert_to_geodataframe(poi_data)

    # Find intersecting tiles
    intersection_tiles = self.data_config.get_intersecting_tiles(
        geometry=poi_data, crs=poi_data.crs
    )

    if not intersection_tiles:
        self.logger.warning("There are no matching GHSL tiles for the POI data")
        return []

    # Generate paths for each intersecting tile
    source_data_paths = [
        self.data_config.get_tile_path(tile_id=tile) for tile in intersection_tiles
    ]

    self.logger.info(f"Resolved {len(source_data_paths)} tile paths for POI data")
    return source_data_paths

ghsl_smod

GhslSmodPoiViewGenerator

Bases: PoiViewGenerator

Generate POI views from GHSL Settlement Model (SMOD).

Source code in gigaspatial/generators/poi/ghsl_smod.py
class GhslSmodPoiViewGenerator(PoiViewGenerator):
    """Generate POI views from GHSL Settlement Model (SMOD)."""

    def __init__(
        self,
        data_config: Optional[GHSLDataConfig] = None,
        generator_config: Optional[PoiViewGeneratorConfig] = None,
        data_store: Optional[DataStore] = None,
    ):
        """
        Initialize the GHSL SMOD generator.

        Args:
            data_config: GHSL data configuration; when not provided, a config
                for product "GHS_SMOD", year 2020, resolution 1000 and
                coord_system 54009 is created
            generator_config: Configuration for the view generator
            data_store: Data store for reading/writing data
        """
        super().__init__(generator_config=generator_config, data_store=data_store)
        self.data_config = data_config or GHSLDataConfig(
            product="GHS_SMOD", year=2020, resolution=1000, coord_system=54009
        )

    def _pre_load_hook(self, source_data_path, **kwargs) -> Any:
        """
        Pre-processing before loading data files.

        Normalises the input to a list of ``.tif`` path strings and checks
        that each one exists in the data store.

        Args:
            source_data_path: A single path or a list of paths (str or Path)
            **kwargs: Unused

        Returns:
            List of path strings with the suffix replaced by ``.tif``

        Raises:
            RuntimeError: If any resulting path is missing from the data store
        """

        # Convert single path to list for consistent handling
        if isinstance(source_data_path, (Path, str)):
            source_data_path = [source_data_path]

        # change source suffix, .zip, to .tif and paths to string.
        # Each entry is wrapped in Path() so plain-string paths work as well;
        # str has no with_suffix().
        source_data_path = [
            str(Path(file_path).with_suffix(".tif")) for file_path in source_data_path
        ]

        # Validate all paths exist
        for file_path in source_data_path:
            if not self.data_store.file_exists(file_path):
                raise RuntimeError(
                    f"Source raster does not exist in the data store: {file_path}"
                )

        self.logger.info(
            f"Pre-loading validation complete for {len(source_data_path)} files"
        )
        return source_data_path

    def _post_load_hook(self, data, **kwargs) -> Any:
        """
        Post-processing after loading data files.

        Only logs the outcome; the data is returned unchanged.
        """
        if not data:
            self.logger.warning("No data was loaded from the source files")
            return data

        self.logger.info(
            f"Post-load processing complete. {len(data)} valid TifProcessors."
        )
        return data

    def resolve_source_paths(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
        **kwargs,
    ) -> List[Union[str, Path]]:
        """
        For GHSL SMOD rasters, resolve source data paths based on POI data geography.

        Args:
            poi_data: POI data whose geometry selects the intersecting tiles
            explicit_paths: Explicit path(s); when given they are returned as
                a list and no tile lookup is performed
            **kwargs: Unused

        Returns:
            List of paths to relevant GHSL SMOD tile rasters (empty when no
            tile intersects the POI data)
        """
        if explicit_paths is not None:
            if isinstance(explicit_paths, (str, Path)):
                return [explicit_paths]
            return list(explicit_paths)

        # Convert to GeoDataFrame if needed. A GeoDataFrame is itself a
        # pandas DataFrame, so the check must be against GeoDataFrame to
        # avoid re-converting data that is already geographic.
        if not isinstance(poi_data, gpd.GeoDataFrame):
            poi_data = convert_to_geodataframe(poi_data)

        # Find intersecting tiles
        intersection_tiles = self.data_config.get_intersecting_tiles(
            geometry=poi_data, crs=poi_data.crs
        )

        if not intersection_tiles:
            self.logger.warning("There are no matching GHSL tiles for the POI data")
            return []

        # Generate paths for each intersecting tile
        source_data_paths = [
            self.data_config.get_tile_path(tile_id=tile) for tile in intersection_tiles
        ]

        self.logger.info(f"Resolved {len(source_data_paths)} tile paths for POI data")
        return source_data_paths

    def load_data(
        self, source_data_path: Union[Path, List[Path]], **kwargs
    ) -> List[TifProcessor]:
        """
        Load GHSL SMOD rasters into TifProcessors from paths.

        Args:
            source_data_path: Path(s) to the source data
            **kwargs: Additional loading parameters, forwarded to both the
                pre-load and post-load hooks

        Returns:
            List of TifProcessors with settlement model data
        """

        processed_paths = self._pre_load_hook(source_data_path, **kwargs)

        tif_processors = [
            TifProcessor(data_path, self.data_store, mode="single")
            for data_path in processed_paths
        ]

        # Forward kwargs so the post hook sees the same options as the pre hook.
        return self._post_load_hook(tif_processors, **kwargs)

    def map_to_poi(
        self,
        processed_data: List[TifProcessor],
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        **kwargs,
    ) -> pd.DataFrame:
        """
        Map from TifProcessors to POI data.

        The rasters are sampled at each POI's (x, y) coordinate and the
        result is written to a ``smod_class`` column on ``poi_data`` itself,
        so the input frame is modified in place.

        Args:
            processed_data: TifProcessors
            poi_data: POI data to map to
            **kwargs: Additional mapping parameters (unused)

        Returns:
            DataFrame with POI data and SMOD classification information
        """

        # Convert to GeoDataFrame if needed
        if not isinstance(poi_data, gpd.GeoDataFrame):
            gdf_points = convert_to_geodataframe(poi_data)
        else:
            gdf_points = poi_data

        # Sampling happens in the CRS of the configured GHSL product.
        gdf_points = gdf_points.to_crs(self.data_config.crs)

        point_geometry = gdf_points["geometry"]
        coord_list = list(zip(point_geometry.x, point_geometry.y))

        sampled_values = sample_multiple_tifs_by_coordinates(
            tif_processors=processed_data, coordinate_list=coord_list
        )

        poi_data["smod_class"] = sampled_values

        return poi_data

    def generate_view(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        source_data_path: Optional[Union[Path, List[Path]]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Generate POI view from GHSL Settlement Model.

        Args:
            poi_data: POI data to map to
            source_data_path: Optional explicit path(s) to the source rasters
            **kwargs: Additional parameters forwarded to ``generate_poi_view``

        Returns:
            Enhanced POI data with SMOD classification
        """
        self.logger.info("Generating GHSL Settlement Model POI view")

        return self.generate_poi_view(
            poi_data=poi_data,
            source_data_path=source_data_path,
            **kwargs,
        )
generate_view(poi_data, source_data_path=None, **kwargs)

Generate POI view from GHSL Settlement Model.

Returns:

Type Description
DataFrame

Enhanced POI data with SMOD classification

Source code in gigaspatial/generators/poi/ghsl_smod.py
def generate_view(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    source_data_path: Optional[Union[Path, List[Path]]] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Build the GHSL Settlement Model view for the given POIs.

    Logs the start of the run, then forwards every argument to
    ``generate_poi_view``.

    Args:
        poi_data: POI data to enrich
        source_data_path: Optional path(s) to the source data,
            forwarded unchanged
        **kwargs: Extra options forwarded to ``generate_poi_view``

    Returns:
        Enhanced POI data with SMOD classification
    """
    self.logger.info("Generating GHSL Settlement Model POI view")

    view = self.generate_poi_view(
        poi_data=poi_data, source_data_path=source_data_path, **kwargs
    )
    return view
load_data(source_data_path, **kwargs)

Load GHSL SMOD rasters into TifProcessors from paths.

Parameters:

Name Type Description Default
source_data_path Union[Path, List[Path]]

Path(s) to the source data

required
**kwargs

Additional loading parameters

{}

Returns:

Type Description
List[TifProcessor]

List of TifProcessors with settlement model data

Source code in gigaspatial/generators/poi/ghsl_smod.py
def load_data(
    self, source_data_path: Union[Path, List[Path]], **kwargs
) -> List[TifProcessor]:
    """
    Wrap each GHSL SMOD raster path in a TifProcessor.

    The paths are first passed through ``_pre_load_hook``; the resulting
    processors are passed through ``_post_load_hook`` before being returned.

    Args:
        source_data_path: Path(s) to the source data
        **kwargs: Additional loading parameters, forwarded to the pre-load hook

    Returns:
        List of TifProcessors with settlement model data
    """
    raster_paths = self._pre_load_hook(source_data_path, **kwargs)

    processors = []
    for raster_path in raster_paths:
        # One processor per raster, opened in "single" mode.
        processors.append(TifProcessor(raster_path, self.data_store, mode="single"))

    return self._post_load_hook(processors)
map_to_poi(processed_data, poi_data, **kwargs)

Map from TifProcessors to POI data.

Parameters:

Name Type Description Default
processed_data List[TifProcessor]

TifProcessors

required
poi_data Union[DataFrame, GeoDataFrame]

POI data to map to

required
**kwargs

Additional mapping parameters

{}

Returns:

Type Description
DataFrame

DataFrame with POI data and SMOD classification information

Source code in gigaspatial/generators/poi/ghsl_smod.py
def map_to_poi(
    self,
    processed_data: List[TifProcessor],
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    **kwargs,
) -> pd.DataFrame:
    """
    Sample the loaded rasters at every POI location.

    Args:
        processed_data: TifProcessors to sample from
        poi_data: POI data to annotate; a ``smod_class`` column is
            written onto this object in place
        **kwargs: Additional mapping parameters (not used here)

    Returns:
        The ``poi_data`` object that was passed in, with the
        ``smod_class`` column added
    """
    # Only plain DataFrames need to be turned into a GeoDataFrame.
    if isinstance(poi_data, gpd.GeoDataFrame):
        points = poi_data
    else:
        points = convert_to_geodataframe(poi_data)

    # Reproject to the CRS configured on data_config before reading x/y.
    geometries = points.to_crs(self.data_config.crs)["geometry"]
    coordinates = list(zip(geometries.x, geometries.y))

    poi_data["smod_class"] = sample_multiple_tifs_by_coordinates(
        tif_processors=processed_data, coordinate_list=coordinates
    )
    return poi_data
resolve_source_paths(poi_data, explicit_paths=None, **kwargs)

For GHSL SMOD rasters, resolve source data paths based on POI data geography.

Returns:

Type Description
List[Union[str, Path]]

List of paths to relevant GHSL SMOD tile rasters

Source code in gigaspatial/generators/poi/ghsl_smod.py
def resolve_source_paths(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
    **kwargs,
) -> List[Union[str, Path]]:
    """
    For GHSL SMOD rasters, resolve source data paths based on POI data geography.

    Args:
        poi_data: POI data used to find intersecting tiles when no
            explicit paths are given
        explicit_paths: Path or collection of paths to use as-is; when
            provided, no tile lookup is performed
        **kwargs: Additional parameters (not used here)

    Returns:
        List of paths to relevant GHSL SMOD tile rasters; empty when no
        tile intersects the POI data
    """
    if explicit_paths is not None:
        if isinstance(explicit_paths, (str, Path)):
            return [explicit_paths]
        return list(explicit_paths)

    # Convert to GeoDataFrame if needed. GeoDataFrame is a subclass of
    # pd.DataFrame, so the check must target GeoDataFrame itself; testing
    # against pd.DataFrame would re-convert frames that already carry
    # geometry.
    if not isinstance(poi_data, gpd.GeoDataFrame):
        poi_data = convert_to_geodataframe(poi_data)

    # Find intersecting tiles
    intersection_tiles = self.data_config.get_intersecting_tiles(
        geometry=poi_data, crs=poi_data.crs
    )

    if not intersection_tiles:
        self.logger.warning("There are no matching GHSL tiles for the POI data")
        return []

    # Generate paths for each intersecting tile
    source_data_paths = [
        self.data_config.get_tile_path(tile_id=tile) for tile in intersection_tiles
    ]

    self.logger.info(f"Resolved {len(source_data_paths)} tile paths for POI data")
    return source_data_paths

google_open_buildings

GoogleBuildingsPoiViewGenerator

Bases: PoiViewGenerator

Generate POI views from Google Open Buildings data.

Source code in gigaspatial/generators/poi/google_open_buildings.py
class GoogleBuildingsPoiViewGenerator(PoiViewGenerator):
    """Generate POI views from Google Open Buildings data."""

    def __init__(
        self,
        data_config: Optional[GoogleOpenBuildingsConfig] = None,
        generator_config: Optional[PoiViewGeneratorConfig] = None,
        data_store: Optional[DataStore] = None,
    ):
        """
        Initialize the generator and its Google Open Buildings downloader.

        Args:
            data_config: Dataset configuration; a default
                GoogleOpenBuildingsConfig is created when omitted
            generator_config: Configuration for the view generator
            data_store: Data store for reading/writing data
        """
        super().__init__(generator_config=generator_config, data_store=data_store)
        self.data_config = data_config or GoogleOpenBuildingsConfig()
        self.handler = GoogleOpenBuildingsDownloader(
            config=self.data_config, data_store=self.data_store
        )

    def _pre_load_hook(self, source_data_path, **kwargs) -> Any:
        """
        Normalize the source paths and check that each one exists.

        Args:
            source_data_path: A single path or an iterable of paths
            **kwargs: Unused

        Returns:
            List of the paths as strings

        Raises:
            RuntimeError: If a path is missing from the handler's data store
        """

        # Convert single path to list for consistent handling
        if isinstance(source_data_path, (Path, str)):
            source_data_path = [source_data_path]

        source_data_path = [str(file_path) for file_path in source_data_path]

        # Validate all paths exist
        for file_path in source_data_path:
            if not self.handler.data_store.file_exists(file_path):
                raise RuntimeError(
                    f"Source buildings file does not exist in the data store: {file_path}"
                )

        self.logger.info(
            f"Pre-loading validation complete for {len(source_data_path)} files"
        )
        return source_data_path

    def _post_load_hook(self, data, **kwargs) -> Any:
        """
        Log the outcome of loading and return the data unchanged.

        Args:
            data: Loaded building data
            **kwargs: Unused

        Returns:
            The same ``data`` object
        """
        if data.empty:
            self.logger.warning("No data was loaded from the source files")
            return data

        self.logger.info(
            f"Post-load processing complete. {len(data)} valid building records."
        )
        return data

    def resolve_source_paths(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
        **kwargs,
    ) -> List[Union[str, Path]]:
        """
        For Google Open Buildings, resolve source data paths based on POI data geography. This determines which tile files
        intersect with the POI data's geographic extent.

        Args:
            poi_data: POI data used to find intersecting tiles when no
                explicit paths are given
            explicit_paths: Path or collection of paths to use as-is
            **kwargs: Additional parameters (not used here)

        Returns:
            List of tile paths; empty when no tile intersects the POI data
        """
        if explicit_paths is not None:
            if isinstance(explicit_paths, (str, Path)):
                return [explicit_paths]
            return list(explicit_paths)

        # Convert to GeoDataFrame if needed
        if not isinstance(poi_data, gpd.GeoDataFrame):
            poi_data = convert_to_geodataframe(poi_data)

        # Find intersecting tiles
        intersection_tiles = self.handler._get_intersecting_tiles(poi_data)

        if intersection_tiles.empty:
            self.logger.warning(
                "There are no matching Google buildings tiles for the POI data"
            )
            return []

        # Generate paths for each intersecting tile
        source_data_paths = [
            self.data_config.get_tile_path(tile_id=tile, data_type="points")
            for tile in intersection_tiles.tile_id
        ]

        self.logger.info(f"Resolved {len(source_data_paths)} tile paths for POI data")
        return source_data_paths

    def load_data(
        self, source_data_path: Union[Path, List[Path]], **kwargs
    ) -> pd.DataFrame:
        """
        Load building data from paths.

        Args:
            source_data_path: Path(s) to the source data
            **kwargs: Additional loading parameters

        Returns:
            DataFrame containing building data; an empty DataFrame when
            there are no paths to read
        """

        processed_paths = self._pre_load_hook(source_data_path, **kwargs)

        all_data = [
            read_dataset(self.handler.data_store, file_path)
            for file_path in processed_paths
        ]

        if not all_data:
            return pd.DataFrame()

        # Concatenate all tile data
        result = pd.concat(all_data, ignore_index=True)

        return self._post_load_hook(result)

    def map_to_poi(
        self, processed_data: pd.DataFrame, poi_data: pd.DataFrame, **kwargs
    ) -> pd.DataFrame:
        """
        Map processed building data to POI data.

        Args:
            processed_data: Building data with ``latitude``, ``longitude``
                and ``full_plus_code`` columns
            poi_data: POI data to map to; the result columns are written
                onto this object in place
            **kwargs: Additional mapping parameters

        Returns:
            DataFrame with POI data and nearest building information
            (``nearest_google_building_id`` and
            ``nearest_google_building_distance``)
        """
        # load_data returns an empty, column-less frame when nothing was
        # loaded; indexing it below would raise, so emit empty result
        # columns instead.
        if processed_data.empty:
            self.logger.warning(
                "No building data available; nearest building columns are left empty"
            )
            poi_data["nearest_google_building_id"] = None
            poi_data["nearest_google_building_distance"] = float("nan")
            return poi_data

        tree = cKDTree(processed_data[["latitude", "longitude"]])

        if "latitude" not in poi_data:
            poi_lat_col, poi_lon_col = detect_coordinate_columns(poi_data)
            df_points = poi_data.rename(
                columns={poi_lat_col: "latitude", poi_lon_col: "longitude"}
            )
        else:
            df_points = poi_data.copy()

        # Nearest neighbour is searched in raw latitude/longitude space.
        _, idx = tree.query(df_points[["latitude", "longitude"]], k=1)

        df_nearest_buildings = processed_data.iloc[idx]

        dist = calculate_distance(
            lat1=df_points.latitude,
            lon1=df_points.longitude,
            lat2=df_nearest_buildings.latitude,
            lon2=df_nearest_buildings.longitude,
        )

        # to_numpy() makes the assignment positional; the nearest-building
        # rows carry the buildings' index, not the POIs'.
        poi_data["nearest_google_building_id"] = (
            df_nearest_buildings.full_plus_code.to_numpy()
        )
        poi_data["nearest_google_building_distance"] = dist

        return poi_data

    def generate_view(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        source_data_path: Optional[Union[Path, List[Path]]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Generate POI view for Google Open Buildings.

        Args:
            poi_data: POI data to enrich
            source_data_path: Optional path(s) to the source data
            **kwargs: Extra options forwarded to ``generate_poi_view``

        Returns:
            Enhanced POI data with nearest building information
        """
        self.logger.info("Generating Google Open Buildings POI view")

        return self.generate_poi_view(
            poi_data=poi_data,
            source_data_path=source_data_path,
            **kwargs,
        )
generate_view(poi_data, source_data_path=None, **kwargs)

Generate POI view for Google Open Buildings.

Returns:

Type Description
DataFrame

Enhanced POI data with nearest building information

Source code in gigaspatial/generators/poi/google_open_buildings.py
def generate_view(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    source_data_path: Optional[Union[Path, List[Path]]] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Build the Google Open Buildings view for the given POIs.

    Logs the start of the run, then forwards every argument to
    ``generate_poi_view``.

    Args:
        poi_data: POI data to enrich
        source_data_path: Optional path(s) to the source data,
            forwarded unchanged
        **kwargs: Extra options forwarded to ``generate_poi_view``

    Returns:
        Enhanced POI data with nearest building information
    """
    self.logger.info("Generating Google Open Buildings POI view")

    view = self.generate_poi_view(
        poi_data=poi_data, source_data_path=source_data_path, **kwargs
    )
    return view
load_data(source_data_path, **kwargs)

Load building data from paths.

Parameters:

Name Type Description Default
source_data_path Union[Path, List[Path]]

Path(s) to the source data

required
**kwargs

Additional loading parameters

{}

Returns:

Type Description
DataFrame

DataFrame containing building data

Source code in gigaspatial/generators/poi/google_open_buildings.py
def load_data(
    self, source_data_path: Union[Path, List[Path]], **kwargs
) -> pd.DataFrame:
    """
    Read every building tile file and stack the results into one frame.

    The paths are first passed through ``_pre_load_hook``. When no paths
    remain, an empty DataFrame is returned without calling the post-load hook.

    Args:
        source_data_path: Path(s) to the source data
        **kwargs: Additional loading parameters, forwarded to the pre-load hook

    Returns:
        DataFrame containing building data
    """
    tile_paths = self._pre_load_hook(source_data_path, **kwargs)

    frames = [
        read_dataset(self.handler.data_store, tile_path) for tile_path in tile_paths
    ]
    if len(frames) == 0:
        return pd.DataFrame()

    # A fresh 0..n-1 index replaces the per-tile indexes.
    combined = pd.concat(frames, ignore_index=True)
    return self._post_load_hook(combined)
map_to_poi(processed_data, poi_data, **kwargs)

Map processed building data to POI data.

Parameters:

Name Type Description Default
processed_data DataFrame

Processed building data with `latitude`, `longitude` and `full_plus_code` columns

required
poi_data DataFrame

POI data to map to

required
**kwargs

Additional mapping parameters

{}

Returns:

Type Description
DataFrame

DataFrame with POI data and nearest building information

Source code in gigaspatial/generators/poi/google_open_buildings.py
def map_to_poi(
    self, processed_data: pd.DataFrame, poi_data: pd.DataFrame, **kwargs
) -> pd.DataFrame:
    """
    Map processed building data to POI data.

    Args:
        processed_data: Building data with ``latitude``, ``longitude``
            and ``full_plus_code`` columns
        poi_data: POI data to map to; the result columns are written
            onto this object in place
        **kwargs: Additional mapping parameters

    Returns:
        DataFrame with POI data and nearest building information
        (``nearest_google_building_id`` and
        ``nearest_google_building_distance``)
    """
    # An empty, column-less frame cannot be indexed by coordinate columns;
    # emit empty result columns instead of raising.
    if processed_data.empty:
        self.logger.warning(
            "No building data available; nearest building columns are left empty"
        )
        poi_data["nearest_google_building_id"] = None
        poi_data["nearest_google_building_distance"] = float("nan")
        return poi_data

    tree = cKDTree(processed_data[["latitude", "longitude"]])

    if "latitude" not in poi_data:
        poi_lat_col, poi_lon_col = detect_coordinate_columns(poi_data)
        df_points = poi_data.rename(
            columns={poi_lat_col: "latitude", poi_lon_col: "longitude"}
        )
    else:
        df_points = poi_data.copy()

    # Nearest neighbour is searched in raw latitude/longitude space.
    _, idx = tree.query(df_points[["latitude", "longitude"]], k=1)

    df_nearest_buildings = processed_data.iloc[idx]

    dist = calculate_distance(
        lat1=df_points.latitude,
        lon1=df_points.longitude,
        lat2=df_nearest_buildings.latitude,
        lon2=df_nearest_buildings.longitude,
    )

    # to_numpy() makes the assignment positional; the nearest-building rows
    # carry the buildings' index, not the POIs'.
    poi_data["nearest_google_building_id"] = (
        df_nearest_buildings.full_plus_code.to_numpy()
    )
    poi_data["nearest_google_building_distance"] = dist

    return poi_data
resolve_source_paths(poi_data, explicit_paths=None, **kwargs)

For Google Open Buildings, resolve source data paths based on POI data geography. This determines which tile files intersect with the POI data's geographic extent.

Source code in gigaspatial/generators/poi/google_open_buildings.py
def resolve_source_paths(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
    **kwargs,
) -> List[Union[str, Path]]:
    """
    Work out which Google Open Buildings tile files are needed for the POIs.

    Explicitly supplied paths take precedence and are returned as a list
    without any lookup. Otherwise the tiles intersecting the POI data are
    obtained from the downloader handler and mapped to their "points"
    tile paths.

    Args:
        poi_data: POI data used for the tile lookup
        explicit_paths: Path or collection of paths to use as-is
        **kwargs: Additional parameters (not used here)

    Returns:
        List of tile paths; empty when no tile intersects the POI data
    """
    if explicit_paths is not None:
        is_single = isinstance(explicit_paths, (str, Path))
        return [explicit_paths] if is_single else list(explicit_paths)

    # The tile lookup is always handed a GeoDataFrame.
    if isinstance(poi_data, gpd.GeoDataFrame):
        geo_pois = poi_data
    else:
        geo_pois = convert_to_geodataframe(poi_data)

    intersection_tiles = self.handler._get_intersecting_tiles(geo_pois)
    if intersection_tiles.empty:
        self.logger.warning(
            "There are no matching Google buildings tiles for the POI data"
        )
        return []

    # One "points" tile path per intersecting tile id.
    source_data_paths = []
    for tile in intersection_tiles.tile_id:
        source_data_paths.append(
            self.data_config.get_tile_path(tile_id=tile, data_type="points")
        )

    self.logger.info(f"Resolved {len(source_data_paths)} tile paths for POI data")
    return source_data_paths

microsoft_global_buildings

MSBuildingsPoiViewGenerator

Bases: PoiViewGenerator

Generate POI views from Microsoft Global Buildings data.

Source code in gigaspatial/generators/poi/microsoft_global_buildings.py
class MSBuildingsPoiViewGenerator(PoiViewGenerator):
    """Generate POI views from Microsoft Global Buildings data."""

    def __init__(
        self,
        data_config: Optional[MSBuildingsConfig] = None,
        generator_config: Optional[PoiViewGeneratorConfig] = None,
        data_store: Optional[DataStore] = None,
    ):
        """
        Initialize the generator.

        Args:
            data_config: Dataset configuration; a default MSBuildingsConfig
                bound to this generator's data store is created when omitted
            generator_config: Configuration for the view generator
            data_store: Data store for reading/writing data
        """
        super().__init__(generator_config=generator_config, data_store=data_store)
        self.data_config = data_config or MSBuildingsConfig(data_store=self.data_store)

    def _pre_load_hook(self, source_data_path, **kwargs) -> Any:
        """
        Normalize the source paths and check that each one exists.

        Args:
            source_data_path: A single path or an iterable of paths
            **kwargs: Unused

        Returns:
            List of the paths as strings

        Raises:
            RuntimeError: If a path is missing from the data store
        """

        # Convert single path to list for consistent handling
        if isinstance(source_data_path, (Path, str)):
            source_data_path = [source_data_path]

        source_data_path = [str(file_path) for file_path in source_data_path]

        # Validate all paths exist
        for file_path in source_data_path:
            if not self.data_store.file_exists(file_path):
                raise RuntimeError(
                    f"Source buildings file does not exist in the data store: {file_path}"
                )

        self.logger.info(
            f"Pre-loading validation complete for {len(source_data_path)} files"
        )
        return source_data_path

    def _post_load_hook(self, data, **kwargs) -> Any:
        """
        Log the outcome of loading and return the data unchanged.

        Args:
            data: Loaded building data
            **kwargs: Unused

        Returns:
            The same ``data`` object
        """
        if data.empty:
            self.logger.warning("No data was loaded from the source files")
            return data

        self.logger.info(
            f"Post-load processing complete. {len(data)} valid building records."
        )
        return data

    def resolve_source_paths(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
        **kwargs,
    ) -> List[Union[str, Path]]:
        """
        For Microsoft Buildings, resolve source data paths based on POI data geography.

        Args:
            poi_data: POI data whose latitude/longitude columns drive the
                tile lookup when no explicit paths are given
            explicit_paths: Path or collection of paths to use as-is
            **kwargs: Additional parameters (not used here)

        Returns:
            List of paths to relevant Microsoft Buildings tile files
        """
        # Return explicit paths if provided
        if explicit_paths is not None:
            if isinstance(explicit_paths, (str, Path)):
                return [explicit_paths]
            return list(explicit_paths)

        if "latitude" not in poi_data:
            poi_lat_col, poi_lon_col = detect_coordinate_columns(poi_data)
        else:
            poi_lat_col, poi_lon_col = ("latitude", "longitude")

        points = poi_data[[poi_lat_col, poi_lon_col]].to_numpy()

        # Find intersecting tiles
        tiles = self.data_config.get_tiles_for_points(points)

        if tiles.empty:
            self.logger.warning(
                "There are no matching Microsoft Buildings tiles for the POI data"
            )
            return []

        # Generate paths for each intersecting tile; "location" is the
        # fallback when a tile's "country" value is falsy.
        source_data_paths = [
            self.data_config.get_tile_path(
                quadkey=tile["quadkey"],
                location=tile["country"] if tile["country"] else tile["location"],
            )
            for _, tile in tiles.iterrows()
        ]

        self.logger.info(f"Resolved {len(source_data_paths)} tile paths for POI data")
        return source_data_paths

    def load_data(
        self, source_data_path: Union[Path, List[Path]], **kwargs
    ) -> gpd.GeoDataFrame:
        """
        Load building data from Microsoft Buildings dataset.

        Args:
            source_data_path: Path(s) to the source data files
            **kwargs: Additional loading parameters

        Returns:
            GeoDataFrame (CRS 4326) containing building data; an empty
            GeoDataFrame when there are no paths to read
        """

        def read_ms_dataset(data_store: DataStore, file_path: str):
            # Each "geometry" entry is converted with `shape` before the
            # frame is wrapped as a GeoDataFrame.
            df = read_gzipped_json_or_csv(file_path=file_path, data_store=data_store)
            df["geometry"] = df["geometry"].apply(shape)
            return gpd.GeoDataFrame(df, crs=4326)

        processed_paths = self._pre_load_hook(source_data_path, **kwargs)

        all_data = [
            read_ms_dataset(self.data_store, file_path)
            for file_path in processed_paths
        ]

        if not all_data:
            # Return an empty GeoDataFrame, not a plain DataFrame, so that
            # process_data can still call get_coordinates() on the result.
            return gpd.GeoDataFrame(geometry=[], crs=4326)

        # Concatenate all tile data
        result = pd.concat(all_data, ignore_index=True)

        return self._post_load_hook(result)

    def process_data(self, data: gpd.GeoDataFrame, **kwargs) -> pd.DataFrame:
        """
        Reduce the building geometries to their coordinates.

        Args:
            data: Building GeoDataFrame
            **kwargs: Unused

        Returns:
            The result of ``data.get_coordinates()``
        """
        # NOTE(review): map_to_poi looks for a "building_id" column, which
        # get_coordinates() presumably does not carry over — confirm.
        return data.get_coordinates()

    def map_to_poi(
        self, processed_data: pd.DataFrame, poi_data: pd.DataFrame, **kwargs
    ) -> pd.DataFrame:
        """
        Map processed building data to POI data.

        Args:
            processed_data: Building coordinates with ``x`` and ``y``
                columns, optionally with a ``building_id`` column
            poi_data: POI data to map to; the result columns are written
                onto this object in place
            **kwargs: Additional mapping parameters

        Returns:
            DataFrame with POI data and nearest building information
            (``nearest_ms_building_id`` and ``nearest_ms_building_distance``)
        """
        # Without any building rows there is nothing to query; emit empty
        # result columns instead of failing on the lookup below.
        if processed_data.empty:
            self.logger.warning(
                "No building data available; nearest building columns are left empty"
            )
            poi_data["nearest_ms_building_id"] = None
            poi_data["nearest_ms_building_distance"] = float("nan")
            return poi_data

        # y/x are paired with latitude/longitude respectively.
        tree = cKDTree(processed_data[["y", "x"]])

        if "latitude" not in poi_data:
            poi_lat_col, poi_lon_col = detect_coordinate_columns(poi_data)
            df_points = poi_data.rename(
                columns={poi_lat_col: "latitude", poi_lon_col: "longitude"}
            )
        else:
            df_points = poi_data.copy()

        _, idx = tree.query(df_points[["latitude", "longitude"]], k=1)

        df_nearest_buildings = processed_data.iloc[idx]

        dist = calculate_distance(
            lat1=df_points.latitude,
            lon1=df_points.longitude,
            lat2=df_nearest_buildings.y,
            lon2=df_nearest_buildings.x,
        )

        # Assign ids positionally: the nearest-building rows carry the
        # buildings' index, so assigning the Series directly would align on
        # index labels rather than on POI order.
        if "building_id" in df_nearest_buildings:
            nearest_ids = df_nearest_buildings["building_id"].to_numpy()
        else:
            nearest_ids = None
        poi_data["nearest_ms_building_id"] = nearest_ids
        poi_data["nearest_ms_building_distance"] = dist

        return poi_data

    def generate_view(
        self,
        poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
        source_data_path: Optional[Union[Path, List[Path]]] = None,
        **kwargs,
    ) -> pd.DataFrame:
        """
        Generate POI view for Microsoft Global Buildings.

        Args:
            poi_data: POI data to enrich
            source_data_path: Optional path(s) to the source data
            **kwargs: Extra options forwarded to ``generate_poi_view``

        Returns:
            Enhanced POI data with nearest building information
        """
        self.logger.info("Generating MicrosoftBuildings POI view")

        return self.generate_poi_view(
            poi_data=poi_data,
            source_data_path=source_data_path,
            **kwargs,
        )
generate_view(poi_data, source_data_path=None, **kwargs)

Generate POI view for Microsoft Global Buildings.

Returns:

Type Description
DataFrame

Enhanced POI data with nearest building information

Source code in gigaspatial/generators/poi/microsoft_global_buildings.py
def generate_view(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    source_data_path: Optional[Union[Path, List[Path]]] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Build the Microsoft Global Buildings view for the given POIs.

    Logs the start of the run, then forwards every argument to
    ``generate_poi_view``.

    Args:
        poi_data: POI data to enrich
        source_data_path: Optional path(s) to the source data,
            forwarded unchanged
        **kwargs: Extra options forwarded to ``generate_poi_view``

    Returns:
        Enhanced POI data with nearest building information
    """
    self.logger.info("Generating MicrosoftBuildings POI view")

    view = self.generate_poi_view(
        poi_data=poi_data, source_data_path=source_data_path, **kwargs
    )
    return view
load_data(source_data_path, **kwargs)

Load building data from Microsoft Buildings dataset.

Parameters:

Name Type Description Default
source_data_path Union[Path, List[Path]]

Path(s) to the source data files

required
**kwargs

Additional loading parameters

{}

Returns:

Type Description
GeoDataFrame

GeoDataFrame containing building data

Source code in gigaspatial/generators/poi/microsoft_global_buildings.py
def load_data(
    self, source_data_path: Union[Path, List[Path]], **kwargs
) -> gpd.GeoDataFrame:
    """
    Load building data from Microsoft Buildings dataset.

    Args:
        source_data_path: Path(s) to the source data files
        **kwargs: Additional loading parameters

    Returns:
        GeoDataFrame (CRS 4326) containing building data; an empty
        GeoDataFrame when there are no paths to read
    """

    def read_ms_dataset(data_store: DataStore, file_path: str):
        # Each "geometry" entry is converted with `shape` before the frame
        # is wrapped as a GeoDataFrame.
        df = read_gzipped_json_or_csv(file_path=file_path, data_store=data_store)
        df["geometry"] = df["geometry"].apply(shape)
        return gpd.GeoDataFrame(df, crs=4326)

    processed_paths = self._pre_load_hook(source_data_path, **kwargs)

    all_data = [
        read_ms_dataset(self.data_store, file_path) for file_path in processed_paths
    ]

    if not all_data:
        # Honour the declared return type: a plain pd.DataFrame lacks
        # GeoDataFrame methods such as get_coordinates().
        return gpd.GeoDataFrame(geometry=[], crs=4326)

    # Concatenate all tile data
    result = pd.concat(all_data, ignore_index=True)

    return self._post_load_hook(result)
map_to_poi(processed_data, poi_data, **kwargs)

Map processed building data to POI data.

Parameters:

Name Type Description Default
processed_data DataFrame

Processed building coordinates with `x` (longitude) and `y` (latitude) columns

required
poi_data DataFrame

POI data to map to

required
**kwargs

Additional mapping parameters

{}

Returns:

Type Description
DataFrame

DataFrame with POI data and nearest building information

Source code in gigaspatial/generators/poi/microsoft_global_buildings.py
def map_to_poi(
    self, processed_data: pd.DataFrame, poi_data: pd.DataFrame, **kwargs
) -> pd.DataFrame:
    """
    Map processed building data to POI data.

    Args:
        processed_data: Building coordinates with ``x`` and ``y`` columns,
            optionally with a ``building_id`` column
        poi_data: POI data to map to; the result columns are written onto
            this object in place
        **kwargs: Additional mapping parameters

    Returns:
        DataFrame with POI data and nearest building information
        (``nearest_ms_building_id`` and ``nearest_ms_building_distance``)
    """
    # Without any building rows there is nothing to query; emit empty
    # result columns instead of failing on the lookup below.
    if processed_data.empty:
        self.logger.warning(
            "No building data available; nearest building columns are left empty"
        )
        poi_data["nearest_ms_building_id"] = None
        poi_data["nearest_ms_building_distance"] = float("nan")
        return poi_data

    # y/x are paired with latitude/longitude respectively.
    tree = cKDTree(processed_data[["y", "x"]])

    if "latitude" not in poi_data:
        poi_lat_col, poi_lon_col = detect_coordinate_columns(poi_data)
        df_points = poi_data.rename(
            columns={poi_lat_col: "latitude", poi_lon_col: "longitude"}
        )
    else:
        df_points = poi_data.copy()

    _, idx = tree.query(df_points[["latitude", "longitude"]], k=1)

    df_nearest_buildings = processed_data.iloc[idx]

    dist = calculate_distance(
        lat1=df_points.latitude,
        lon1=df_points.longitude,
        lat2=df_nearest_buildings.y,
        lon2=df_nearest_buildings.x,
    )

    # Assign ids positionally: the nearest-building rows carry the
    # buildings' index, so assigning the Series directly would align on
    # index labels rather than on POI order.
    if "building_id" in df_nearest_buildings:
        nearest_ids = df_nearest_buildings["building_id"].to_numpy()
    else:
        nearest_ids = None
    poi_data["nearest_ms_building_id"] = nearest_ids
    poi_data["nearest_ms_building_distance"] = dist

    return poi_data
resolve_source_paths(poi_data, explicit_paths=None, **kwargs)

For Microsoft Buildings, resolve source data paths based on POI data geography.

Returns:

Type Description
List[Union[str, Path]]

List of paths to relevant Microsoft Buildings tile files

Source code in gigaspatial/generators/poi/microsoft_global_buildings.py
def resolve_source_paths(
    self,
    poi_data: Union[pd.DataFrame, gpd.GeoDataFrame],
    explicit_paths: Optional[Union[Path, str, List[Union[str, Path]]]] = None,
    **kwargs,
) -> List[Union[str, Path]]:
    """
    Work out which Microsoft Buildings tile files are needed for the POIs.

    Explicitly supplied paths take precedence and are returned as a list
    without any lookup. Otherwise the POI latitude/longitude pairs are
    passed to the data config to find the matching tiles, and each tile
    is mapped to its file path.

    Args:
        poi_data: POI data holding the coordinate columns
        explicit_paths: Path or collection of paths to use as-is
        **kwargs: Additional parameters (not used here)

    Returns:
        List of paths to relevant Microsoft Buildings tile files
    """
    if explicit_paths is not None:
        is_single = isinstance(explicit_paths, (str, Path))
        return [explicit_paths] if is_single else list(explicit_paths)

    # Prefer the canonical column names; detect them only when absent.
    if "latitude" in poi_data:
        lat_col, lon_col = "latitude", "longitude"
    else:
        lat_col, lon_col = detect_coordinate_columns(poi_data)

    tiles = self.data_config.get_tiles_for_points(
        poi_data[[lat_col, lon_col]].to_numpy()
    )

    if tiles.empty:
        self.logger.warning(
            "There are no matching Microsoft Buildings tiles for the POI data"
        )
        return []

    source_data_paths = []
    for _, tile in tiles.iterrows():
        # "location" is the fallback when the tile's "country" is falsy.
        source_data_paths.append(
            self.data_config.get_tile_path(
                quadkey=tile["quadkey"],
                location=tile["country"] or tile["location"],
            )
        )

    self.logger.info(f"Resolved {len(source_data_paths)} tile paths for POI data")
    return source_data_paths