# Source code for bronx.datagrip.netcdf

"""
Utility class to deal with netcdf files.

It allows to generate a simplified representation of a NetCDF Dataset structure
(see the :meth:`netcdf_dataset_structure` method). Additionally, two NetCDF
datasets can be compared: both the structure of the file and the data will be
compared (see the :meth:`netcdf_dataset_diff` and :meth:`netcdf_file_diff`
methods).

Here is a demo:

First we need a helper function that creates a NetCDF dataset for demonstration
purposes:

    >>> def create_demo_netcdf4(nc_filename):
    ...     demoset = netCDF4.Dataset(nc_filename, mode='w')
    ...     demoset.setncattr('title', 'NetCDF Demo Data')
    ...     x_u = demoset.createDimension('X', 5)
    ...     y_u = demoset.createDimension('Y', 2)
    ...     g1 = demoset.createGroup('group1')
    ...     g2 = demoset.createGroup('group2')
    ...     vx = demoset.createVariable('x', np.float64, ('X', ), zlib=True)
    ...     vx[:] = np.arange(1, 7, 1.4)
    ...     vy = demoset.createVariable('y', np.float64, ('Y', ), zlib=True)
    ...     vy[:] = [0, 1]
    ...     T = g1.createVariable('T', np.float32, ('X', 'Y'))
    ...     T.setncattr('unit', 'Kelvin')
    ...     T[...] = np.reshape(np.arange(270, 280, 1), (5, 2))
    ...     T[1, 0] = np.ma.masked
    ...     z_u = g2.createDimension('Z', size=None)
    ...     T = g2.createVariable('T', np.float32, ('X', 'Y', 'Z'))
    ...     T.setncattr('unit', 'Kelvin')
    ...     T[..., 0] = np.reshape(np.arange(270, 280, 1), (5, 2))
    ...     T[..., 1] = np.reshape(np.arange(300, 310, 1), (5, 2))
    ...     T[0, 0, 0] = np.nan
    ...     return demoset

Create a NetCDF dataset and display its structure:

    >>> import tempfile
    >>> demofile1 = tempfile.NamedTemporaryFile(mode='wb', delete=True)
    >>> demoset1 = create_demo_netcdf4(demofile1.name)
    >>> demodesc1 = netcdf_dataset_structure(demoset1)
    >>> demodesc1 == {
    ...     'dimensions': {'X': {'size': 5, 'unlimited': False},
    ...                    'Y': {'size': 2, 'unlimited': False}},
    ...     'groups': {'group1': {'dimensions': {},
    ...                           'groups': {},
    ...                           'ncattrs': {},
    ...                           'variables': {'T': {'datatype': np.float32,
    ...                                               'dimensions': ('X', 'Y'),
    ...                                               'filters': {'complevel': 0, },
    ...                                               'ncattrs': {'unit': 'Kelvin'},
    ...                                               'shape': (5, 2)}}},
    ...                'group2': {'dimensions': {'Z': {'size': 2, 'unlimited': True}},
    ...                           'groups': {},
    ...                           'ncattrs': {},
    ...                           'variables': {'T': {'datatype': np.float32,
    ...                                               'dimensions': ('X', 'Y', 'Z'),
    ...                                               'filters': {'complevel': 0, },
    ...                                               'ncattrs': {'unit': 'Kelvin'},
    ...                                               'shape': (5, 2, 2)}}}},
    ...     'ncattrs': {'title': 'NetCDF Demo Data'},
    ...     'variables': {'x': {'datatype': np.float64,
    ...                         'dimensions': ('X',),
    ...                         'filters': {'complevel': 4,
    ...                                     'shuffle': True,
    ...                                     'zlib': True},
    ...                         'ncattrs': {},
    ...                         'shape': (5,)},
    ...                   'y': {'datatype': np.float64,
    ...                         'dimensions': ('Y',),
    ...                         'filters': {'complevel': 4,
    ...                                     'shuffle': True,
    ...                                     'zlib': True},
    ...                         'ncattrs': {},
    ...                         'shape': (2,)}}}
    True

First comparison attempt (compare a dataset to itself):

    >>> netcdf_dataset_diff(demoset1, demoset1)
    == Comparison of the two netcdf structures: ==
    (legend: created: "+"  deleted: "-"  unchanged: "="  updated: "?")
    No differences
    == Comparison of data available in both netcdf datasets: ==
    4 data arrays out of 4 are identical
    True

Create a different dataset and perform a comparison:

    >>> demofile2 = tempfile.NamedTemporaryFile(mode='wb', delete=True)
    >>> demoset2 = create_demo_netcdf4(demofile2.name)
    >>> demoset2.delncattr('title')
    >>> demoset2['group1']['T'][1, 0] = 276.
    >>> demoset2['group1']['T'][1, 1] = np.ma.masked
    >>> demoset2['group2']['T'].unit = 'Celsius'
    >>> demoset2['group2']['T'][1, 1, 1] = 0
    >>> s2_g3 = demoset2.createGroup('group3')
    >>> s2_T = s2_g3.createVariable('T', np.float32, ('X', 'Y'))
    >>> s2_T[...] = 0
    >>> netcdf_dataset_diff(demoset1, demoset2)  # doctest: +ELLIPSIS
    == Comparison of the two netcdf structures: ==
    (legend: created: "+"  deleted: "-"  unchanged: "="  updated: "?")
    ? groups:
      | + group3: NetCDF4ParentStructure::<<as_dict:: ...>>
      | ? group2:
      |   | ? variables:
      |   |   | ? T:
      |   |   |   | ? ncattrs:
      |   |   |   |   | ? unit: before='Kelvin' after='Celsius'
    ? ncattrs:
      | - title: 'NetCDF Demo Data'
    == Comparison of data available in both netcdf datasets: ==
    2 data arrays out of 4 are identical
    /group1/T differs
    /group2/T differs
    False

Clean things up...

    >>> demofile1.close()
    >>> demofile2.close()

"""
import netCDF4
import numpy as np

from bronx.stdtypes.tracking import RecursiveMappingTracker


class NetCDF4ParentStructure(dict):
    """
    Dictionary describing the layout of a :class:`NetCDF4.Dataset` or
    :class:`NetCDF4.Group` object.

    Four keys are populated: ``ncattrs`` (attribute name -> value),
    ``groups`` (sub-group name -> :class:`NetCDF4ParentStructure`),
    ``dimensions`` (name -> :class:`NetCDF4DimensionStructure`) and
    ``variables`` (name -> :class:`NetCDF4VariableStructure`).
    """

    def __init__(self, netcdf4_obj):
        """
        :param netcdf4_obj: The Dataset or Group object to process.
        :raises ValueError: if **netcdf4_obj** is neither a Dataset nor a Group.
        """
        accepted_types = (netCDF4.Dataset, netCDF4.Group)
        if not isinstance(netcdf4_obj, accepted_types):
            raise ValueError("'{!r}' is not an appropriate netCDF4 object"
                             .format(netcdf4_obj))
        super().__init__()

        # Attributes attached to the dataset/group itself
        attributes = dict()
        for attr_name in netcdf4_obj.ncattrs():
            attributes[attr_name] = netcdf4_obj.getncattr(attr_name)
        self['ncattrs'] = attributes

        # Sub-groups are described recursively
        subgroups = dict()
        for group_name, group in netcdf4_obj.groups.items():
            subgroups[group_name] = NetCDF4ParentStructure(group)
        self['groups'] = subgroups

        # Dimensions declared at this level
        dimensions = dict()
        for dim_name, dimension in netcdf4_obj.dimensions.items():
            dimensions[dim_name] = NetCDF4DimensionStructure(dimension)
        self['dimensions'] = dimensions

        # Variables declared at this level
        variables = dict()
        for var_name, variable in netcdf4_obj.variables.items():
            variables[var_name] = NetCDF4VariableStructure(variable)
        self['variables'] = variables


class NetCDF4DimensionStructure(dict):
    """
    Dictionary describing a :class:`NetCDF4.Dimension` object.

    Two keys are populated: ``size`` and ``unlimited``.
    """

    def __init__(self, netcdf4_obj):
        """
        :param netcdf4_obj: The Dimension object to process.
        :raises ValueError: if **netcdf4_obj** is not a Dimension object.
        """
        if not isinstance(netcdf4_obj, netCDF4.Dimension):
            message = ("'{!r}' is not an appropriate netCDF4 object"
                       .format(netcdf4_obj))
            raise ValueError(message)
        super().__init__()
        # Record the dimension's properties (in a fixed order)
        properties = (
            ('size', netcdf4_obj.size),
            ('unlimited', netcdf4_obj.isunlimited()),
        )
        for key, value in properties:
            self[key] = value


class NetCDF4VariableStructure(dict):
    """
    Dictionary describing a :class:`NetCDF4.Variable` object.

    The following keys are populated: ``datatype``, ``dimensions``,
    ``shape``, ``ncattrs`` and ``filters``.
    """

    def __init__(self, netcdf4_obj):
        """
        :param netcdf4_obj: The Variable object to process.
        :raises ValueError: if **netcdf4_obj** is not a Variable object.
        """
        if not isinstance(netcdf4_obj, netCDF4.Variable):
            message = ("'{!r}' is not an appropriate netCDF4 object"
                       .format(netcdf4_obj))
            raise ValueError(message)
        super().__init__()

        # Basic description of the variable
        self['datatype'] = netcdf4_obj.datatype
        self['dimensions'] = netcdf4_obj.dimensions
        self['shape'] = netcdf4_obj.shape

        # Attributes attached to the variable
        attributes = dict()
        for attr_name in netcdf4_obj.ncattrs():
            attributes[attr_name] = netcdf4_obj.getncattr(attr_name)
        self['ncattrs'] = attributes

        # Only keep the filter settings that are actually set (entries equal
        # to None or False are dropped). If filters() does not return a
        # mapping, no filter is recorded.
        filter_settings = netcdf4_obj.filters()
        active_filters = dict()
        if hasattr(filter_settings, 'items'):
            for filter_name, filter_value in filter_settings.items():
                if filter_value is None or filter_value is False:
                    continue
                active_filters[filter_name] = filter_value
        self['filters'] = active_filters


def netcdf_dataset_structure(netcdf4_obj):
    """
    Build a simplified, dictionary-like description of the structure of a
    :class:`NetCDF4.Dataset` object.

    :param netcdf4_obj: The Dataset (or Group) object to describe.
    :return: A :class:`NetCDF4ParentStructure` object.
    """
    structure = NetCDF4ParentStructure(netcdf4_obj)
    return structure


def netcdf_dataset_diff(netcdf4_ref, netcdf4_new, verbose=True):
    """Compare two :class:`NetCDF4.Dataset` objects.

    Both the structure of the NetCDF Dataset and the data will be compared.
    Data are only compared for variables that are found (same group path and
    same name) in both datasets.

    :param netcdf4_ref: The reference Dataset object.
    :param netcdf4_new: The Dataset object that is compared to the reference.
    :param bool verbose: If True, both comparison reports are always printed;
        otherwise a report is printed only when it contains differences.
    :return: True if the two structures are identical **and** the data of
        all the common variables are identical, False otherwise.
    :rtype: bool
    """
    # Compare the netcdf structures
    rmt = RecursiveMappingTracker(
        netcdf_dataset_structure(netcdf4_ref),
        netcdf_dataset_structure(netcdf4_new)
    )
    same_structure = not len(rmt)
    if verbose or not same_structure:
        print('== Comparison of the two netcdf structures: ==')
        rmt.dump_legend()
        rmt.differences()

    def all_variables(netcdf4_obj):
        """Helper function to recursively list the NetCDF variables."""
        variables = dict()
        for nc_var in netcdf4_obj.variables.values():
            # Key = full path of the variable (e.g. "/group1/T")
            where = nc_var.group().path.rstrip('/') + '/' + nc_var.name
            variables[where] = nc_var
        for nc_group in netcdf4_obj.groups.values():
            variables.update(all_variables(nc_group))
        return variables

    # Generate the list of variables
    ref_vars = all_variables(netcdf4_ref)
    new_vars = all_variables(netcdf4_new)
    # Compare the data for common items
    common_vars = set(ref_vars) & set(new_vars)
    updated_vars = set()
    for path_to_var in common_vars:
        # Always work on masked arrays: the mask/filled/compressed members
        # used below do not exist on plain ndarrays (which netCDF4 may
        # presumably return depending on the variable type and on the
        # auto-masking settings).
        ref_values = np.ma.asarray(ref_vars[path_to_var][...])
        new_values = np.ma.asarray(new_vars[path_to_var][...])
        # Specifically look for NaNs
        try:
            ref_nans = np.isnan(ref_values).filled(False)
            new_nans = np.isnan(new_values).filled(False)
        except TypeError:
            # a TypeError is raised if ref_values or new_values are not
            # numerical data (e.g strings)
            numeric_type = False
        else:
            numeric_type = True
        if numeric_type and (np.any(ref_nans) or np.any(new_nans)):
            # NaNs must be located at the same places...
            if not np.array_equal(ref_nans, new_nans):
                updated_vars.add(path_to_var)
                continue
            # ... and are then masked since NaN != NaN would otherwise make
            # the final comparison fail.
            ref_values[ref_nans] = np.ma.masked
            new_values[new_nans] = np.ma.masked
        if np.any(ref_values.mask) or np.any(new_values.mask):
            # Masked values must be located at the same places
            if not np.array_equal(new_values.mask, ref_values.mask):
                updated_vars.add(path_to_var)
                continue
            # Only compare the values that are not masked
            ref_values = ref_values.compressed()
            new_values = new_values.compressed()
        if not np.array_equal(new_values, ref_values):
            updated_vars.add(path_to_var)
    # Summary
    if verbose or updated_vars:
        print('== Comparison of data available in both netcdf datasets: ==')
        print('{:d} data arrays out of {:d} are identical'
              .format(len(common_vars) - len(updated_vars), len(common_vars)))
        for path_to_var in sorted(updated_vars):
            print('{:s} differs'.format(path_to_var))
    # The structure comparison result must not be lost: the datasets are
    # identical only if both the structure and the data match.
    return same_structure and not updated_vars


def netcdf_file_diff(netcdf_file_ref, netcdf_file_new):
    """Compare two NetCDF files.

    Both files are opened read-only, compared using
    :func:`netcdf_dataset_diff` and closed before returning (even if the
    comparison raises an exception).

    :param netcdf_file_ref: Path to the reference NetCDF file.
    :param netcdf_file_new: Path to the NetCDF file that is compared to the
        reference.
    :return: The value returned by :func:`netcdf_dataset_diff`.
    """
    with netCDF4.Dataset(netcdf_file_ref, mode='r') as netcdf4_ref, \
            netCDF4.Dataset(netcdf_file_new, mode='r') as netcdf4_new:
        return netcdf_dataset_diff(netcdf4_ref, netcdf4_new)


if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    import doctest as _doctest
    _doctest.testmod()