"""
Utility module to deal with NetCDF files.
It can generate a simplified representation of a NetCDF Dataset structure
(see the :func:`netcdf_dataset_structure` function). Additionally, two NetCDF
datasets can be compared: both the structure of the file and the data will be
compared (see the :func:`netcdf_dataset_diff` and :func:`netcdf_file_diff`
functions).
Here is a demo:
First we need a helper function that creates a NetCDF dataset for demonstration
purposes:
>>> def create_demo_netcdf4(nc_filename):
...     demoset = netCDF4.Dataset(nc_filename, mode='w')
...     demoset.setncattr('title', 'NetCDF Demo Data')
...     x_u = demoset.createDimension('X', 5)
...     y_u = demoset.createDimension('Y', 2)
...     g1 = demoset.createGroup('group1')
...     g2 = demoset.createGroup('group2')
...     vx = demoset.createVariable('x', np.float64, ('X', ), zlib=True)
...     vx[:] = np.arange(1, 7, 1.4)
...     vy = demoset.createVariable('y', np.float64, ('Y', ), zlib=True)
...     vy[:] = [0, 1]
...     T = g1.createVariable('T', np.float32, ('X', 'Y'))
...     T.setncattr('unit', 'Kelvin')
...     T[...] = np.reshape(np.arange(270, 280, 1), (5, 2))
...     T[1, 0] = np.ma.masked
...     z_u = g2.createDimension('Z', size=None)
...     T = g2.createVariable('T', np.float32, ('X', 'Y', 'Z'))
...     T.setncattr('unit', 'Kelvin')
...     T[..., 0] = np.reshape(np.arange(270, 280, 1), (5, 2))
...     T[..., 1] = np.reshape(np.arange(300, 310, 1), (5, 2))
...     T[0, 0, 0] = np.nan
...     return demoset
Create a NetCDF dataset and display its structure:
>>> import tempfile
>>> demofile1 = tempfile.NamedTemporaryFile(mode='wb', delete=True)
>>> demoset1 = create_demo_netcdf4(demofile1.name)
>>> demodesc1 = netcdf_dataset_structure(demoset1)
>>> demodesc1 == {
... 'dimensions': {'X': {'size': 5, 'unlimited': False},
... 'Y': {'size': 2, 'unlimited': False}},
... 'groups': {'group1': {'dimensions': {},
... 'groups': {},
... 'ncattrs': {},
... 'variables': {'T': {'datatype': np.float32,
... 'dimensions': ('X', 'Y'),
... 'filters': {'complevel': 0, },
... 'ncattrs': {'unit': 'Kelvin'},
... 'shape': (5, 2)}}},
... 'group2': {'dimensions': {'Z': {'size': 2, 'unlimited': True}},
... 'groups': {},
... 'ncattrs': {},
... 'variables': {'T': {'datatype': np.float32,
... 'dimensions': ('X', 'Y', 'Z'),
... 'filters': {'complevel': 0, },
... 'ncattrs': {'unit': 'Kelvin'},
... 'shape': (5, 2, 2)}}}},
... 'ncattrs': {'title': 'NetCDF Demo Data'},
... 'variables': {'x': {'datatype': np.float64,
... 'dimensions': ('X',),
... 'filters': {'complevel': 4,
... 'shuffle': True,
... 'zlib': True},
... 'ncattrs': {},
... 'shape': (5,)},
... 'y': {'datatype': np.float64,
... 'dimensions': ('Y',),
... 'filters': {'complevel': 4,
... 'shuffle': True,
... 'zlib': True},
... 'ncattrs': {},
... 'shape': (2,)}}}
True
First comparison attempt (compare a dataset to itself):
>>> netcdf_dataset_diff(demoset1, demoset1)
== Comparison of the two netcdf structures: ==
(legend: created: "+" deleted: "-" unchanged: "=" updated: "?")
No differences
== Comparison of data available in both netcdf datasets: ==
4 data arrays out of 4 are identical
True
Create a different dataset and perform a comparison:
>>> demofile2 = tempfile.NamedTemporaryFile(mode='wb', delete=True)
>>> demoset2 = create_demo_netcdf4(demofile2.name)
>>> demoset2.delncattr('title')
>>> demoset2['group1']['T'][1, 0] = 276.
>>> demoset2['group1']['T'][1, 1] = np.ma.masked
>>> demoset2['group2']['T'].unit = 'Celsius'
>>> demoset2['group2']['T'][1, 1, 1] = 0
>>> s2_g3 = demoset2.createGroup('group3')
>>> s2_T = s2_g3.createVariable('T', np.float32, ('X', 'Y'))
>>> s2_T[...] = 0
>>> netcdf_dataset_diff(demoset1, demoset2) # doctest: +ELLIPSIS
== Comparison of the two netcdf structures: ==
(legend: created: "+" deleted: "-" unchanged: "=" updated: "?")
? groups:
| + group3: NetCDF4ParentStructure::<<as_dict:: ...>>
| ? group2:
| | ? variables:
| | | ? T:
| | | | ? ncattrs:
| | | | | ? unit: before='Kelvin' after='Celsius'
? ncattrs:
| - title: 'NetCDF Demo Data'
== Comparison of data available in both netcdf datasets: ==
2 data arrays out of 4 are identical
/group1/T differs
/group2/T differs
False
Clean things up...
>>> demofile1.close()
>>> demofile2.close()
"""
import netCDF4
import numpy as np
from bronx.stdtypes.tracking import RecursiveMappingTracker
class NetCDF4ParentStructure(dict):
    """
    Dictionary describing the layout of a :class:`NetCDF4.Dataset` or
    :class:`NetCDF4.Group` object.

    Four entries are filled in: ``ncattrs``, ``groups``, ``dimensions`` and
    ``variables``. Sub-groups are described recursively.
    """

    def __init__(self, netcdf4_obj):
        """
        :param netcdf4_obj: The Dataset or Group object to process.
        :raises ValueError: if *netcdf4_obj* is neither a Dataset nor a Group.
        """
        # Refuse anything that is not a NetCDF container
        accepted_types = (netCDF4.Dataset, netCDF4.Group)
        if not isinstance(netcdf4_obj, accepted_types):
            message = "'{!r}' is not an appropriate netCDF4 object"
            raise ValueError(message.format(netcdf4_obj))
        super().__init__()

        # Global (or group-level) attributes
        attributes = {}
        for attr_name in netcdf4_obj.ncattrs():
            attributes[attr_name] = netcdf4_obj.getncattr(attr_name)
        self['ncattrs'] = attributes

        # Nested groups (recursive description)
        subgroups = {}
        for group_name, group in netcdf4_obj.groups.items():
            subgroups[group_name] = NetCDF4ParentStructure(group)
        self['groups'] = subgroups

        # Dimensions declared at this level
        dimensions = {}
        for dim_name, dimension in netcdf4_obj.dimensions.items():
            dimensions[dim_name] = NetCDF4DimensionStructure(dimension)
        self['dimensions'] = dimensions

        # Variables declared at this level
        variables = {}
        for var_name, variable in netcdf4_obj.variables.items():
            variables[var_name] = NetCDF4VariableStructure(variable)
        self['variables'] = variables
class NetCDF4DimensionStructure(dict):
    """
    Dictionary describing a :class:`NetCDF4.Dimension` object.

    It holds the dimension's ``size`` and whether it is ``unlimited``.
    """

    def __init__(self, netcdf4_obj):
        """
        :param netcdf4_obj: The Dimension object to process.
        :raises ValueError: if *netcdf4_obj* is not a Dimension.
        """
        # Refuse anything that is not a NetCDF dimension
        if not isinstance(netcdf4_obj, netCDF4.Dimension):
            message = "'{!r}' is not an appropriate netCDF4 object"
            raise ValueError(message.format(netcdf4_obj))
        super().__init__()
        # Record the dimension's properties (size first, then the flag)
        self.update(size=netcdf4_obj.size,
                    unlimited=netcdf4_obj.isunlimited())
class NetCDF4VariableStructure(dict):
    """
    Dictionary describing a :class:`NetCDF4.Variable` object.

    It holds the variable's ``datatype``, ``dimensions``, ``shape``,
    ``ncattrs`` and the ``filters`` that are actually set.
    """

    def __init__(self, netcdf4_obj):
        """
        :param netcdf4_obj: The Variable object to process.
        :raises ValueError: if *netcdf4_obj* is not a Variable.
        """
        # Refuse anything that is not a NetCDF variable
        if not isinstance(netcdf4_obj, netCDF4.Variable):
            message = "'{!r}' is not an appropriate netCDF4 object"
            raise ValueError(message.format(netcdf4_obj))
        super().__init__()

        # Properties that are copied verbatim from the variable
        for prop_name in ('datatype', 'dimensions', 'shape'):
            self[prop_name] = getattr(netcdf4_obj, prop_name)

        # Variable-level attributes
        attributes = {}
        for attr_name in netcdf4_obj.ncattrs():
            attributes[attr_name] = netcdf4_obj.getncattr(attr_name)
        self['ncattrs'] = attributes

        # Filter settings: entries that are None or False are left out; an
        # empty dict is stored when filters() does not return a mapping-like
        # object (i.e. something without an ``items`` method).
        active_filters = {}
        filter_settings = netcdf4_obj.filters()
        if hasattr(filter_settings, 'items'):
            for filter_name, filter_value in filter_settings.items():
                if filter_value is not None and filter_value is not False:
                    active_filters[filter_name] = filter_value
        self['filters'] = active_filters
def netcdf_dataset_structure(netcdf4_obj):
    """
    Build a dictionary-like description of the structure of a
    :class:`NetCDF4.Dataset` object.

    :param netcdf4_obj: The Dataset (or Group) object to describe.
    :return: A :class:`NetCDF4ParentStructure` object built from *netcdf4_obj*.
    """
    structure = NetCDF4ParentStructure(netcdf4_obj)
    return structure
def _netcdf_all_variables(netcdf4_obj):
    """Recursively map ``/group/.../name`` paths to the NetCDF variables."""
    variables = dict()
    for nc_var in netcdf4_obj.variables.values():
        where = nc_var.group().path.rstrip('/') + '/' + nc_var.name
        variables[where] = nc_var
    for nc_group in netcdf4_obj.groups.values():
        variables.update(_netcdf_all_variables(nc_group))
    return variables


def _netcdf_data_differs(ref_values, new_values):
    """Tell whether two data arrays differ.

    NaNs and masked items must be located at the same places in both arrays;
    all the remaining values must be equal.
    """
    # Coerce to masked arrays so that the ``filled``/``mask``/``compressed``
    # accesses below are always valid, even if a plain ndarray is handed over.
    ref_values = np.ma.asarray(ref_values)
    new_values = np.ma.asarray(new_values)
    # Specifically look for NaNs
    try:
        ref_nans = np.isnan(ref_values).filled(False)
        new_nans = np.isnan(new_values).filled(False)
    except TypeError:
        # a TypeError is raised if ref_values or new_values are not
        # numerical data (e.g strings): there are no NaNs to deal with
        pass
    else:
        if np.any(ref_nans) or np.any(new_nans):
            if not np.array_equal(ref_nans, new_nans):
                return True
            # NaN never compares equal to NaN: hide the (matching) NaNs
            # behind the mask so that they are ignored below
            ref_values[ref_nans] = np.ma.masked
            new_values[new_nans] = np.ma.masked
    # Masked items must match; only unmasked values are then compared
    if np.any(ref_values.mask) or np.any(new_values.mask):
        if not np.array_equal(new_values.mask, ref_values.mask):
            return True
        ref_values = ref_values.compressed()
        new_values = new_values.compressed()
    return not np.array_equal(new_values, ref_values)


def netcdf_dataset_diff(netcdf4_ref, netcdf4_new, verbose=True):
    """Compare two :class:`NetCDF4.Dataset` objects.

    Both the structure of the NetCDF Dataset and the data will be compared.
    The data comparison only covers the variables that exist (same path and
    name) in both datasets.

    :param netcdf4_ref: The reference Dataset.
    :param netcdf4_new: The Dataset that is compared to the reference.
    :param bool verbose: If ``False``, a section of the report (structure or
        data) is printed only when differences are found in it.
    :return: ``True`` if both the structures and the common data are
        identical, ``False`` otherwise.
    """
    # Compare the netcdf structures (an empty tracker means "no differences")
    rmt = RecursiveMappingTracker(
        netcdf_dataset_structure(netcdf4_ref),
        netcdf_dataset_structure(netcdf4_new)
    )
    same_structure = not len(rmt)
    if verbose or not same_structure:
        print('== Comparison of the two netcdf structures: ==')
        rmt.dump_legend()
        rmt.differences()

    # Generate the list of variables
    ref_vars = _netcdf_all_variables(netcdf4_ref)
    new_vars = _netcdf_all_variables(netcdf4_new)

    # Compare the data for common items
    common_vars = set(ref_vars) & set(new_vars)
    updated_vars = set()
    for path_to_var in common_vars:
        if _netcdf_data_differs(ref_vars[path_to_var][...],
                                new_vars[path_to_var][...]):
            updated_vars.add(path_to_var)

    # Summary
    if verbose or updated_vars:
        print('== Comparison of data available in both netcdf datasets: ==')
        print('{:d} data arrays out of {:d} are identical'
              .format(len(common_vars) - len(updated_vars), len(common_vars)))
        for path_to_var in sorted(updated_vars):
            print('{:s} differs'.format(path_to_var))

    # The structure verdict must not be discarded by the data comparison
    return same_structure and not updated_vars
def netcdf_file_diff(netcdf_file_ref, netcdf_file_new, verbose=True):
    """Compare two NetCDF files.

    Both files are opened read-only, compared using
    :func:`netcdf_dataset_diff` and closed again.

    :param netcdf_file_ref: Path to the reference NetCDF file.
    :param netcdf_file_new: Path to the NetCDF file compared to the reference.
    :param bool verbose: Forwarded to :func:`netcdf_dataset_diff`.
    :return: The boolean returned by :func:`netcdf_dataset_diff`.
    """
    # The context managers close both datasets, even if the comparison raises
    with netCDF4.Dataset(netcdf_file_ref, mode='r') as ref_dataset, \
            netCDF4.Dataset(netcdf_file_new, mode='r') as new_dataset:
        return netcdf_dataset_diff(ref_dataset, new_dataset, verbose=verbose)
if __name__ == '__main__':
    # Executed as a script: run the examples embedded in the docstrings.
    from doctest import testmod
    testmod()