Skip to content

EarthStat Geo Data Processing Documentation

Comprehensive guide to the earthstat.geo_data_processing module functionalities, designed for geospatial data manipulation and analysis.

Raster Data Manipulation

Clipping Raster Data for Area of Interest

clipMultipleRasters(raster_paths, shapefile_path, invalid_values=None)

Clips a raster file using a shapefile, optionally filtering out specified invalid values. The clipped raster is saved in a new directory named 'clipped' plus the original file directory.

Parameters:

Name Type Description Default
raster_path str

Path to the raster file to be clipped.

required
shapefile_path str

Path to the shapefile used for clipping.

required
invalid_values list

Values in the raster to treat as invalid and replace with NaN.

None

The function creates a new directory (if it doesn't already exist) and saves the clipped raster there.

Source code in earthstat/geo_data_processing/clip_raster.py
def clipMultipleRasters(raster_paths, shapefile_path, invalid_values=None):
    """
    Clips a raster file using a shapefile, optionally filtering out specified invalid values.
    The clipped raster is saved in a new directory named 'clipped' plus the original file directory.

    Args:
        raster_path (str): Path to the raster file to be clipped.
        shapefile_path (str): Path to the shapefile used for clipping.
        invalid_values (list, optional): Values in the raster to treat as invalid and replace with NaN.

    The function creates a new directory (if it doesn't already exist) and saves the clipped raster there.
    """

    # Enhancement: by open shapefile and create dir our of the loop
    output_clip_dir = os.path.join(os.path.dirname(raster_paths[0]), 'clipped')
    os.makedirs(output_clip_dir, exist_ok=True)

    shapefile = gpd.read_file(shapefile_path)

    # Using ProcessPoolExecutor to parallelize the task
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:

        # Use list to force execution and tqdm for progress bar
        list(tqdm(executor.map(clipRasterWithShapefile, raster_paths, [shapefile]*len(raster_paths), [invalid_values]*len(raster_paths)),
                  total=len(raster_paths), desc="Clipping Rasters"))

    return output_clip_dir

clipRasterWithShapefile(raster_path, shapefile, invalid_values=None)

Clips multiple raster files using a single shapefile, optionally filtering out specified invalid values. Each clipped raster is saved in a new directory named 'clipped' plus the original file directory.

Parameters:

Name Type Description Default
raster_paths list of str

Paths to the raster files to be clipped.

required
shapefile_path str

Path to the shapefile used for clipping.

required
invalid_values list

Values in the raster to treat as invalid and replace with NaN.

None

Returns:

Type Description
str

The path to the directory where clipped rasters are saved.

Processes each raster sequentially, showing progress with a progress bar.

Source code in earthstat/geo_data_processing/clip_raster.py
def clipRasterWithShapefile(raster_path, shapefile, invalid_values=None):
    """
    Clips multiple raster files using a single shapefile, optionally filtering out specified invalid values.
    Each clipped raster is saved in a new directory named 'clipped' plus the original file directory.

    Args:
        raster_paths (list of str): Paths to the raster files to be clipped.
        shapefile_path (str): Path to the shapefile used for clipping.
        invalid_values (list, optional): Values in the raster to treat as invalid and replace with NaN.

    Returns:
        str: The path to the directory where clipped rasters are saved.

    Processes each raster sequentially, showing progress with a progress bar.
    """
    file_dir, file_name = os.path.split(raster_path)

    output_clip = os.path.join(file_dir, 'clipped')

    with rasterio.open(raster_path) as src:
        geoms = [mapping(shape) for shape in shapefile.geometry]
        out_image, out_transform = mask(src, geoms, crop=True)
        if invalid_values:
            for invalid_value in invalid_values:
                out_image = np.where(
                    out_image == invalid_value, np.nan, out_image)

        out_meta = src.meta.copy()
        out_meta.update({
            "driver": "GTiff",
            "height": out_image.shape[1],
            "width": out_image.shape[2],
            "transform": out_transform,
            "dtype": 'float32',
            "compress": "lzw"
        })

    output_path = os.path.join(output_clip, f"clipped_{file_name}")
    with rasterio.open(output_path, "w", **out_meta) as dest:
        dest.write(out_image)

Rescaling and Resampling for Data Uniformity

rescaleResampleMask(mask_path, raster_data_path, scale_factor=None, resampling_method='bilinear')

Rescales and resamples a mask raster based on a target raster's specifications. Optionally applies scaling to the mask data's values and resamples using a specified method.

Parameters:

Name Type Description Default
mask_path str

Path to the mask raster file to be rescaled and resampled.

required
raster_data_path str

Path to the target raster file for matching specifications.

required
scale_factor tuple

Min and max values for rescaling the mask data. Defaults to None.

None
resampling_method str

Method for resampling ('bilinear', 'cubic', 'average', 'nearest'.). Defaults to bilinear.

'bilinear'

Returns:

Type Description
str

The path to the saved rescaled and resampled raster file.

Source code in earthstat/geo_data_processing/rescale_resample_raster.py
def rescaleResampleMask(mask_path, raster_data_path, scale_factor=None, resampling_method="bilinear"):
    """
    Rescales and resamples a mask raster based on a target raster's specifications. 
    Optionally applies scaling to the mask data's values and resamples using a specified method.

    Args:
        mask_path (str): Path to the mask raster file to be rescaled and resampled.
        raster_data_path (str): Path to the target raster file for matching specifications.
        scale_factor (tuple, optional): Min and max values for rescaling the mask data. Defaults to None.
        resampling_method (str, optional): Method for resampling ('bilinear', 'cubic', 'average', 'nearest'.). Defaults to bilinear.

    Returns:
        str: The path to the saved rescaled and resampled raster file.
    """
    file_dir, file_name = savedFilePath(mask_path)

    resampling_enum = resamplingMethod(resampling_method)

    with rasterio.open(raster_data_path) as target_raster:
        target_transform = target_raster.transform

    with rasterio.open(mask_path) as mask:
        mask_data = mask.read(1)

        if scale_factor:
            # Use actual min and max from the data
            old_min, old_max = mask_data.min(), mask_data.max()
            new_min, new_max = scale_factor
            rescaled_data = ((mask_data - old_min) /
                             (old_max - old_min)) * (new_max - new_min) + new_min
            print("\nRescaling Mask...")
            print("Resampling mask...")
        else:
            rescaled_data = mask_data  # Skip rescaling if scale_factor is None
            print("\nResampling mask...")
        out_meta = mask.meta.copy()
        out_meta.update({
            "driver": "GTiff",
            "height": target_raster.height,
            "width": target_raster.width,
            "transform": target_transform,
            "crs": target_raster.crs,
            "compress": "DEFLATE",  # Future Enhancement:specify best compression scheme here
            "predictor": "2",  # !!! good for continuous data !!!
            "zlevel": 1  # compression level, 9 is the highest
        })

    resampled_data = np.empty(
        shape=(target_raster.height, target_raster.width), dtype=out_meta['dtype'])
    warp.reproject(
        source=rescaled_data,
        destination=resampled_data,
        src_transform=mask.transform,
        src_crs=mask.crs,
        dst_transform=target_transform,
        dst_crs=target_raster.crs,
        resampling=resampling_enum
    )

    output_path = f'{file_dir}/rescaled_resampled_{resampling_method}_{file_name}'

    with rasterio.open(output_path, "w", **out_meta) as dest:
        dest.write(resampled_data, 1)

    print(f"Filtered shapefile saved to: {output_path}")

    return output_path

Shapefile Handling

Efficient Shapefile Processing Techniques

filterShapefile(shapefile_path, countries, country_column_name)

Filters a shapefile based on a list of country names within a specified column.

Parameters:

Name Type Description Default
shapefile_path str

Path to the shapefile to be filtered.

required
countries list of str

List of country names to filter by.

required
country_column_name str

Column name in the shapefile that contains country names.

required

Returns:

Type Description
str

Path to the filtered shapefile saved in the same directory as the original.

Source code in earthstat/geo_data_processing/shapefile_process.py
def filterShapefile(shapefile_path, countries, country_column_name):
    """
    Filters a shapefile based on a list of country names within a specified column.

    Args:
        shapefile_path (str): Path to the shapefile to be filtered.
        countries (list of str): List of country names to filter by.
        country_column_name (str): Column name in the shapefile that contains country names.

    Returns:
        str: Path to the filtered shapefile saved in the same directory as the original.
    """
    file_dir, file_name = savedFilePath(shapefile_path)
    gdf = gpd.read_file(shapefile_path)
    filtered_gdf = gdf[gdf[country_column_name].isin(countries)]
    output_path = f'{file_dir}/filtered_{file_name}'
    filtered_gdf.to_file(output_path)
    print(f"Filtered shapefile saved to: {output_path}")
    return output_path

reprojectShapefileToRaster(raster_data_path, shapefile_path)

Reprojects a shapefile to match the Coordinate Reference System (CRS) of a given raster file.

Parameters:

Name Type Description Default
raster_data_path str

Path to the raster file whose CRS is to be matched.

required
shapefile_path str

Path to the shapefile to be reprojected.

required

Returns:

Type Description
str

Path to the reprojected shapefile saved in the same directory as the original.

Source code in earthstat/geo_data_processing/shapefile_process.py
def reprojectShapefileToRaster(raster_data_path, shapefile_path):
    """
    Reprojects a shapefile to match the Coordinate Reference System (CRS) of a given raster file.

    Args:
        raster_data_path (str): Path to the raster file whose CRS is to be matched.
        shapefile_path (str): Path to the shapefile to be reprojected.

    Returns:
        str: Path to the reprojected shapefile saved in the same directory as the original.
    """
    file_dir, file_name = savedFilePath(shapefile_path)
    with rasterio.open(raster_data_path) as src:
        raster_crs = src.crs

    shapefile = gpd.read_file(shapefile_path)
    output_path = f'{file_dir}/reprojected_{file_name}'
    shapefile_reprojected = shapefile.to_crs(raster_crs)
    shapefile_reprojected.to_file(output_path)

    print(
        f"Shapefile reprojected to match raster CRS and saved to {output_path}")
    return output_path