Cloud optimized access to NASA data with earthaccess and virtualizarr¶
This notebook will focus on the usage of earthaccess.open_virtual_dataset
and earthaccess.open_virtual_mfdataset
to create cloud optimized reference files for the data stored in the cloud.
All of the examples in this tutorial load data over https (access="indirect"). However, there is a significant speed improvement when using these functions in-cloud and enabling access="direct", for example on managed cloud JupyterHubs like NASA VEDA or 2i2c Openscapes. This is because the data is streamed directly from cloud storage to cloud compute.
WARNING: This feature is currently experimental and may change in the future. It relies on NASA DMR++ metadata files, which may not always be present for your dataset; when they are missing you may get a FileNotFoundError.
import earthaccess
import xarray as xr
# NASA JPL Multiscale Ultrahigh Resolution (MUR) Sea Surface Temperature (SST) dataset - 0.01 degree resolution
# Search the collection by its short name, restricted to January 2010.
results = earthaccess.search_data(
temporal=("2010-01-01", "2010-01-31"), short_name="MUR-JPL-L4-GLOB-v4.1"
)
len(results)  # how many search results came back
32
%%time
mur = earthaccess.open_virtual_mfdataset(
results,
access="indirect",  # read over https rather than direct in-cloud access
load=True,  # request a loadable dataset instead of the virtual (load=False) form
# The remaining keyword arguments are combine options
# (`**xr_combine_nested_kwargs` in the function signature).
concat_dim="time",
coords="all",
compat="override",
combine_attrs="drop_conflicts",
)
# NOTE(review): in the recorded output this call raised ValueError inside
# fsspec's ReferenceFileSystem, so `mur` was never defined -- re-check against
# the installed zarr/fsspec versions.
mur
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) File <timed exec>:1 File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/earthaccess/dmrpp_zarr.py:131, in open_virtual_mfdataset(granules, group, access, load, preprocess, parallel, **xr_combine_nested_kwargs) 129 refs = vds.virtualize.to_kerchunk(filepath=None, format="dict") 130 protocol = "s3" if "s3" in fs.protocol else fs.protocol --> 131 return xr.open_dataset( 132 "reference://", 133 engine="zarr", 134 chunks={}, 135 backend_kwargs={ 136 "consolidated": False, 137 "storage_options": { 138 "fo": refs, # codespell:ignore 139 "remote_protocol": protocol, 140 "remote_options": fs.storage_options, 141 }, 142 }, 143 ) 144 return vds File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/xarray/backends/api.py:687, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs) 675 decoders = _resolve_decoders_kwargs( 676 decode_cf, 677 open_backend_dataset_parameters=backend.open_dataset_parameters, (...) 683 decode_coords=decode_coords, 684 ) 686 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) --> 687 backend_ds = backend.open_dataset( 688 filename_or_obj, 689 drop_variables=drop_variables, 690 **decoders, 691 **kwargs, 692 ) 693 ds = _dataset_from_backend_dataset( 694 backend_ds, 695 filename_or_obj, (...) 
705 **kwargs, 706 ) 707 return ds File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/xarray/backends/zarr.py:1608, in ZarrBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, zarr_version, zarr_format, store, engine, use_zarr_fill_value_as_mask, cache_members) 1606 filename_or_obj = _normalize_path(filename_or_obj) 1607 if not store: -> 1608 store = ZarrStore.open_group( 1609 filename_or_obj, 1610 group=group, 1611 mode=mode, 1612 synchronizer=synchronizer, 1613 consolidated=consolidated, 1614 consolidate_on_close=False, 1615 chunk_store=chunk_store, 1616 storage_options=storage_options, 1617 zarr_version=zarr_version, 1618 use_zarr_fill_value_as_mask=None, 1619 zarr_format=zarr_format, 1620 cache_members=cache_members, 1621 ) 1623 store_entrypoint = StoreBackendEntrypoint() 1624 with close_on_error(store): File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/xarray/backends/zarr.py:732, in ZarrStore.open_group(cls, store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, append_dim, write_region, safe_chunks, zarr_version, zarr_format, use_zarr_fill_value_as_mask, write_empty, cache_members) 707 @classmethod 708 def open_group( 709 cls, (...) 
725 cache_members: bool = True, 726 ): 727 ( 728 zarr_group, 729 consolidate_on_close, 730 close_store_on_close, 731 use_zarr_fill_value_as_mask, --> 732 ) = _get_open_params( 733 store=store, 734 mode=mode, 735 synchronizer=synchronizer, 736 group=group, 737 consolidated=consolidated, 738 consolidate_on_close=consolidate_on_close, 739 chunk_store=chunk_store, 740 storage_options=storage_options, 741 zarr_version=zarr_version, 742 use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask, 743 zarr_format=zarr_format, 744 ) 746 return cls( 747 zarr_group, 748 mode, (...) 756 cache_members, 757 ) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/xarray/backends/zarr.py:1845, in _get_open_params(store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, zarr_version, use_zarr_fill_value_as_mask, zarr_format) 1841 if _zarr_v3(): 1842 # we have determined that we don't want to use consolidated metadata 1843 # so we set that to False to avoid trying to read it 1844 open_kwargs["use_consolidated"] = False -> 1845 zarr_group = zarr.open_group(store, **open_kwargs) 1847 close_store_on_close = zarr_group.store is not store 1849 # we use this to determine how to handle fill_value File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/_compat.py:43, in _deprecate_positional_args.<locals>._inner_deprecate_positional_args.<locals>.inner_f(*args, **kwargs) 41 extra_args = len(args) - len(all_args) 42 if extra_args <= 0: ---> 43 return f(*args, **kwargs) 45 # extra_args > 0 46 args_msg = [ 47 f"{name}={arg}" 48 for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:], strict=False) 49 ] File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/api/synchronous.py:527, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, zarr_format, meta_array, 
attributes, use_consolidated) 449 @_deprecate_positional_args 450 def open_group( 451 store: StoreLike | None = None, (...) 463 use_consolidated: bool | str | None = None, 464 ) -> Group: 465 """Open a group using file-mode-like semantics. 466 467 Parameters (...) 524 The new group. 525 """ 526 return Group( --> 527 sync( 528 async_api.open_group( 529 store=store, 530 mode=mode, 531 cache_attrs=cache_attrs, 532 synchronizer=synchronizer, 533 path=path, 534 chunk_store=chunk_store, 535 storage_options=storage_options, 536 zarr_version=zarr_version, 537 zarr_format=zarr_format, 538 meta_array=meta_array, 539 attributes=attributes, 540 use_consolidated=use_consolidated, 541 ) 542 ) 543 ) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/core/sync.py:163, in sync(coro, loop, timeout) 160 return_result = next(iter(finished)).result() 162 if isinstance(return_result, BaseException): --> 163 raise return_result 164 else: 165 return return_result File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/core/sync.py:119, in _runner(coro) 114 """ 115 Await a coroutine and return the result of running it. If awaiting the coroutine raises an 116 exception, the exception will be returned. 
117 """ 118 try: --> 119 return await coro 120 except Exception as ex: 121 return ex File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/api/asynchronous.py:806, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, zarr_format, meta_array, attributes, use_consolidated) 803 if chunk_store is not None: 804 warnings.warn("chunk_store is not yet implemented", RuntimeWarning, stacklevel=2) --> 806 store_path = await make_store_path(store, mode=mode, storage_options=storage_options, path=path) 808 if attributes is None: 809 attributes = {} File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/storage/_common.py:305, in make_store_path(store_like, path, mode, storage_options) 303 if _is_fsspec_uri(store_like): 304 used_storage_options = True --> 305 store = FsspecStore.from_url( 306 store_like, storage_options=storage_options, read_only=_read_only 307 ) 308 else: 309 store = await LocalStore.open(root=Path(store_like), read_only=_read_only) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/storage/_fsspec.py:176, in FsspecStore.from_url(cls, url, storage_options, read_only, allowed_exceptions) 173 opts = storage_options or {} 174 opts = {"asynchronous": True, **opts} --> 176 fs, path = url_to_fs(url, **opts) 177 if not fs.async_impl: 178 try: File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/fsspec/core.py:415, in url_to_fs(url, **kwargs) 413 inkwargs["fo"] = urls 414 urlpath, protocol, _ = chain[0] --> 415 fs = filesystem(protocol, **inkwargs) 416 return fs, urlpath File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/fsspec/registry.py:310, in filesystem(protocol, **storage_options) 303 warnings.warn( 304 "The 'arrow_hdfs' protocol has been deprecated and will be " 305 "removed in the future. 
Specify it as 'hdfs'.", 306 DeprecationWarning, 307 ) 309 cls = get_filesystem_class(protocol) --> 310 return cls(**storage_options) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/fsspec/spec.py:81, in _Cached.__call__(cls, *args, **kwargs) 79 return cls._cache[token] 80 else: ---> 81 obj = super().__call__(*args, **kwargs) 82 # Setting _fs_token here causes some static linters to complain. 83 obj._fs_token_ = token File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/fsspec/implementations/reference.py:770, in ReferenceFileSystem.__init__(self, fo, target, ref_storage_args, target_protocol, target_options, remote_protocol, remote_options, fs, template_overrides, simple_templates, max_gap, max_block, cache_size, **kwargs) 768 self.fss[k] = AsyncFileSystemWrapper(f, asynchronous=self.asynchronous) 769 elif self.asynchronous ^ f.asynchronous: --> 770 raise ValueError( 771 "Reference-FS's target filesystem must have same value" 772 "of asynchronous" 773 ) ValueError: Reference-FS's target filesystem must have same valueof asynchronous
print(f"{mur.nbytes / 1e9} GB")  # dataset size in bytes, scaled by 1e9 to gigabytes
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[4], line 1 ----> 1 print(f"{mur.nbytes / 1e9} GB") NameError: name 'mur' is not defined
mur.isel(time=0).sel(  # take the first time step
lat=slice(20, 45), lon=slice(-95, -50)  # subset to lat 20..45 and lon -95..-50
).analysed_sst.plot.pcolormesh(x="lon", y="lat", cmap="plasma", figsize=(8, 4))  # plot analysed_sst on lon/lat axes
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[5], line 1 ----> 1 mur.isel(time=0).sel( 2 lat=slice(20, 45), lon=slice(-95, -50) 3 ).analysed_sst.plot.pcolormesh(x="lon", y="lat", cmap="plasma", figsize=(8, 4)) NameError: name 'mur' is not defined
Save virtual reference file and load with xarray¶
If you have a dataset you frequently access, or you want to share this blueprint file with others, it is recommended to create a virtual reference file that points to the data in the cloud. This allows xarray to rapidly load the dataset as if it were a Zarr store.
Notice below that load=False. This means that the output of open_virtual_mfdataset is a virtual xarray Dataset that contains only chunk information and metadata. You can modify this dataset, then save it to a virtual reference file (as JSON), and then simply load that file with xarray. For more information on virtual reference files, see the virtualizarr documentation.
Sample workflow:
- Open a dataset with
open_virtual_mfdataset
with load=False
- Modify the dataset as needed
- Save the dataset to a virtual reference file with
vds.virtualize.to_kerchunk(...)
- Load the virtual reference file with
xr.open_dataset(..., engine='kerchunk')
%%time
# Build the virtual dataset: with load=False the result holds only chunk
# information and metadata rather than array values.
mur_vds = earthaccess.open_virtual_mfdataset(
results,
access="indirect",  # read over https
load=False,
# Combine options for joining the per-granule datasets along "time".
concat_dim="time",
coords="all",
compat="override",
combine_attrs="drop_conflicts",
)
mur_vds
CPU times: user 926 ms, sys: 21.8 ms, total: 948 ms Wall time: 24.4 s
<xarray.Dataset> Size: 124GB Dimensions: (time: 32, lat: 17999, lon: 36000) Coordinates: time (time) int32 128B ManifestArray<shape=(32,), dtype=int3... lat (lat) float32 72kB ManifestArray<shape=(17999,), dtype=... lon (lon) float32 144kB ManifestArray<shape=(36000,), dtype... Data variables: mask (time, lat, lon) int8 21GB ManifestArray<shape=(32, 179... sea_ice_fraction (time, lat, lon) int8 21GB ManifestArray<shape=(32, 179... analysed_sst (time, lat, lon) int16 41GB ManifestArray<shape=(32, 17... analysis_error (time, lat, lon) int16 41GB ManifestArray<shape=(32, 17... Attributes: (12/42) Conventions: CF-1.5 title: Daily MUR SST, Final product summary: A merged, multi-sensor L4 Foundation SST anal... references: http://podaac.jpl.nasa.gov/Multi-scale_Ultra-... institution: Jet Propulsion Laboratory history: created at nominal 4-day latency; replaced nr... ... ... project: NASA Making Earth Science Data Records for Us... publisher_name: GHRSST Project Office publisher_url: http://www.ghrsst.org publisher_email: ghrsst-po@nceo.ac.uk processing_level: L4 cdm_data_type: grid
# Example of what's inside this virtual dataset
# Array-level metadata (shape, chunks, dtype, filters) for analysed_sst.
print(mur_vds.analysed_sst.data.zarray)
# Manifest entry for chunk key "0.0.1": source file path plus byte offset and length.
print(mur_vds.analysed_sst.data.manifest.dict()["0.0.1"])
ZArray(shape=(32, 17999, 36000), chunks=(1, 1023, 2047), dtype=dtype('int16'), fill_value=-32768, order='C', compressor=None, filters=[{'id': 'shuffle', 'elementsize': 2}, {'id': 'zlib', 'level': 6}], zarr_format=2) {'path': 'https://archive.podaac.earthdata.nasa.gov/podaac-ops-cumulus-protected/MUR-JPL-L4-GLOB-v4.1/20100101090000-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc', 'offset': 44835, 'length': 4083}
# Save the virtual dataset's references as a JSON file named mur_kerchunk.json.
mur_vds.virtualize.to_kerchunk(filepath="mur_kerchunk.json", format="json")
%%time
fs = earthaccess.get_fsspec_https_session()  # https filesystem session from earthaccess
ds = xr.open_dataset(
"reference://",  # open through fsspec's reference filesystem
engine="zarr",
chunks={},
backend_kwargs={
"consolidated": False,
"storage_options": {
"fo": "mur_kerchunk.json",  # the JSON reference file saved earlier
# Protocol and options used to reach the files the references point to.
"remote_protocol": fs.protocol,
"remote_options": fs.storage_options,
},
},
)
# NOTE(review): in the recorded output this call raised
# "ValueError: Reference-FS's target filesystem must have same value of
# asynchronous" from fsspec -- presumably a sync/async mismatch between the
# reference filesystem and the remote one; confirm against the installed
# zarr/fsspec versions.
print(ds)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) File <timed exec>:2 File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/xarray/backends/api.py:687, in open_dataset(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, chunked_array_type, from_array_kwargs, backend_kwargs, **kwargs) 675 decoders = _resolve_decoders_kwargs( 676 decode_cf, 677 open_backend_dataset_parameters=backend.open_dataset_parameters, (...) 683 decode_coords=decode_coords, 684 ) 686 overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) --> 687 backend_ds = backend.open_dataset( 688 filename_or_obj, 689 drop_variables=drop_variables, 690 **decoders, 691 **kwargs, 692 ) 693 ds = _dataset_from_backend_dataset( 694 backend_ds, 695 filename_or_obj, (...) 705 **kwargs, 706 ) 707 return ds File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/xarray/backends/zarr.py:1608, in ZarrBackendEntrypoint.open_dataset(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, zarr_version, zarr_format, store, engine, use_zarr_fill_value_as_mask, cache_members) 1606 filename_or_obj = _normalize_path(filename_or_obj) 1607 if not store: -> 1608 store = ZarrStore.open_group( 1609 filename_or_obj, 1610 group=group, 1611 mode=mode, 1612 synchronizer=synchronizer, 1613 consolidated=consolidated, 1614 consolidate_on_close=False, 1615 chunk_store=chunk_store, 1616 storage_options=storage_options, 1617 zarr_version=zarr_version, 1618 use_zarr_fill_value_as_mask=None, 1619 zarr_format=zarr_format, 1620 cache_members=cache_members, 1621 ) 1623 store_entrypoint = StoreBackendEntrypoint() 
1624 with close_on_error(store): File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/xarray/backends/zarr.py:732, in ZarrStore.open_group(cls, store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, append_dim, write_region, safe_chunks, zarr_version, zarr_format, use_zarr_fill_value_as_mask, write_empty, cache_members) 707 @classmethod 708 def open_group( 709 cls, (...) 725 cache_members: bool = True, 726 ): 727 ( 728 zarr_group, 729 consolidate_on_close, 730 close_store_on_close, 731 use_zarr_fill_value_as_mask, --> 732 ) = _get_open_params( 733 store=store, 734 mode=mode, 735 synchronizer=synchronizer, 736 group=group, 737 consolidated=consolidated, 738 consolidate_on_close=consolidate_on_close, 739 chunk_store=chunk_store, 740 storage_options=storage_options, 741 zarr_version=zarr_version, 742 use_zarr_fill_value_as_mask=use_zarr_fill_value_as_mask, 743 zarr_format=zarr_format, 744 ) 746 return cls( 747 zarr_group, 748 mode, (...) 
756 cache_members, 757 ) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/xarray/backends/zarr.py:1845, in _get_open_params(store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, zarr_version, use_zarr_fill_value_as_mask, zarr_format) 1841 if _zarr_v3(): 1842 # we have determined that we don't want to use consolidated metadata 1843 # so we set that to False to avoid trying to read it 1844 open_kwargs["use_consolidated"] = False -> 1845 zarr_group = zarr.open_group(store, **open_kwargs) 1847 close_store_on_close = zarr_group.store is not store 1849 # we use this to determine how to handle fill_value File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/_compat.py:43, in _deprecate_positional_args.<locals>._inner_deprecate_positional_args.<locals>.inner_f(*args, **kwargs) 41 extra_args = len(args) - len(all_args) 42 if extra_args <= 0: ---> 43 return f(*args, **kwargs) 45 # extra_args > 0 46 args_msg = [ 47 f"{name}={arg}" 48 for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:], strict=False) 49 ] File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/api/synchronous.py:527, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, zarr_format, meta_array, attributes, use_consolidated) 449 @_deprecate_positional_args 450 def open_group( 451 store: StoreLike | None = None, (...) 463 use_consolidated: bool | str | None = None, 464 ) -> Group: 465 """Open a group using file-mode-like semantics. 466 467 Parameters (...) 524 The new group. 
525 """ 526 return Group( --> 527 sync( 528 async_api.open_group( 529 store=store, 530 mode=mode, 531 cache_attrs=cache_attrs, 532 synchronizer=synchronizer, 533 path=path, 534 chunk_store=chunk_store, 535 storage_options=storage_options, 536 zarr_version=zarr_version, 537 zarr_format=zarr_format, 538 meta_array=meta_array, 539 attributes=attributes, 540 use_consolidated=use_consolidated, 541 ) 542 ) 543 ) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/core/sync.py:163, in sync(coro, loop, timeout) 160 return_result = next(iter(finished)).result() 162 if isinstance(return_result, BaseException): --> 163 raise return_result 164 else: 165 return return_result File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/core/sync.py:119, in _runner(coro) 114 """ 115 Await a coroutine and return the result of running it. If awaiting the coroutine raises an 116 exception, the exception will be returned. 
117 """ 118 try: --> 119 return await coro 120 except Exception as ex: 121 return ex File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/api/asynchronous.py:806, in open_group(store, mode, cache_attrs, synchronizer, path, chunk_store, storage_options, zarr_version, zarr_format, meta_array, attributes, use_consolidated) 803 if chunk_store is not None: 804 warnings.warn("chunk_store is not yet implemented", RuntimeWarning, stacklevel=2) --> 806 store_path = await make_store_path(store, mode=mode, storage_options=storage_options, path=path) 808 if attributes is None: 809 attributes = {} File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/storage/_common.py:305, in make_store_path(store_like, path, mode, storage_options) 303 if _is_fsspec_uri(store_like): 304 used_storage_options = True --> 305 store = FsspecStore.from_url( 306 store_like, storage_options=storage_options, read_only=_read_only 307 ) 308 else: 309 store = await LocalStore.open(root=Path(store_like), read_only=_read_only) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/zarr/storage/_fsspec.py:176, in FsspecStore.from_url(cls, url, storage_options, read_only, allowed_exceptions) 173 opts = storage_options or {} 174 opts = {"asynchronous": True, **opts} --> 176 fs, path = url_to_fs(url, **opts) 177 if not fs.async_impl: 178 try: File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/fsspec/core.py:415, in url_to_fs(url, **kwargs) 413 inkwargs["fo"] = urls 414 urlpath, protocol, _ = chain[0] --> 415 fs = filesystem(protocol, **inkwargs) 416 return fs, urlpath File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/fsspec/registry.py:310, in filesystem(protocol, **storage_options) 303 warnings.warn( 304 "The 'arrow_hdfs' protocol has been deprecated and will be " 305 "removed in the future. 
Specify it as 'hdfs'.", 306 DeprecationWarning, 307 ) 309 cls = get_filesystem_class(protocol) --> 310 return cls(**storage_options) File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/fsspec/spec.py:81, in _Cached.__call__(cls, *args, **kwargs) 79 return cls._cache[token] 80 else: ---> 81 obj = super().__call__(*args, **kwargs) 82 # Setting _fs_token here causes some static linters to complain. 83 obj._fs_token_ = token File ~/checkouts/readthedocs.org/user_builds/earthaccess/envs/973/lib/python3.11/site-packages/fsspec/implementations/reference.py:770, in ReferenceFileSystem.__init__(self, fo, target, ref_storage_args, target_protocol, target_options, remote_protocol, remote_options, fs, template_overrides, simple_templates, max_gap, max_block, cache_size, **kwargs) 768 self.fss[k] = AsyncFileSystemWrapper(f, asynchronous=self.asynchronous) 769 elif self.asynchronous ^ f.asynchronous: --> 770 raise ValueError( 771 "Reference-FS's target filesystem must have same value" 772 "of asynchronous" 773 ) ValueError: Reference-FS's target filesystem must have same valueof asynchronous
Read datasets with groups¶
# NASA TEMPO NO2 tropospheric and stratospheric columns V03
# Look the collection up by DOI, requesting two results.
results = earthaccess.search_data(count=2, doi="10.5067/IS-40e/TEMPO/NO2_L2.003")
len(results)  # how many search results came back
2
# Open the first result as a virtual dataset, selecting the "product" group.
earthaccess.open_virtual_dataset(results[0], group="product")
<xarray.Dataset> Size: 7MB Dimensions: (mirror_step: 123, xtrack: 2048) Dimensions without coordinates: mirror_step, xtrack Data variables: main_data_quality_flag (mirror_step, xtrack) int16 504kB ManifestArray<shape=(123, 2048), dtype=int16, chunks=(123, 2048... vertical_column_troposphere (mirror_step, xtrack) float64 2MB ManifestArray<shape=(123, 2048), dtype=float64, chunks=(123, 2048... vertical_column_stratosphere (mirror_step, xtrack) float64 2MB ManifestArray<shape=(123, 2048), dtype=float64, chunks=(123, 2048... vertical_column_troposphere_uncertainty (mirror_step, xtrack) float64 2MB ManifestArray<shape=(123, 2048), dtype=float64, chunks=(123, 2048...
Advanced: Preprocess the datasets¶
You can also preprocess the datasets before saving the virtual reference file. This is useful if you want to apply a function to the datasets before concatenation. For example, the SWOT_L2_LR_SSH_Expert_2.0
dataset (from NASA JPL SWOT satellite) is an L2 product where each file represents a single pass of the satellite. If you want to combine all the passes into a single dataset, you can concatenate the datasets using cycle_number
and pass_number
which are only found in the attributes of each netcdf file.
The preprocess function and argument allow us to turn those attributes into dimensions first, and then concatenate along this new dimension.
# Search the SWOT_L2_LR_SSH_Expert_2.0 collection, requesting ten results.
# NOTE(review): ("2023") is a parenthesized string, not a tuple (there is no
# trailing comma), so a single value is passed for `temporal` -- confirm that
# this is the intended time filter.
results = earthaccess.search_data(
count=10, temporal=("2023"), short_name="SWOT_L2_LR_SSH_Expert_2.0"
)
%%time
def preprocess(ds: xr.Dataset) -> xr.Dataset:
    """Promote the cycle and pass numbers from attributes to dimensions.

    Adds ``cycle_num`` and ``pass_num`` dimensions to ``ds`` and labels them
    with the ``cycle_number`` and ``pass_number`` values read from
    ``ds.attrs``, so that datasets can later be concatenated along them.

    Args:
        ds: Dataset whose ``attrs`` contain ``cycle_number`` and
            ``pass_number``.

    Returns:
        A dataset with the two new dimensions and their coordinates.
    """
    expanded = ds.expand_dims(["cycle_num", "pass_num"])
    labels = {
        "cycle_num": [ds.attrs["cycle_number"]],
        "pass_num": [ds.attrs["pass_number"]],
    }
    return expanded.assign_coords(**labels)
# Open the SWOT results as one virtual dataset (load=False), handing each
# dataset to `preprocess` before they are concatenated along "pass_num".
swot = earthaccess.open_virtual_mfdataset(
results,
access="indirect",  # read over https
load=False,
preprocess=preprocess,
concat_dim="pass_num",
coords="all",
compat="override",
combine_attrs="drop_conflicts",
)
swot
CPU times: user 530 ms, sys: 10.2 ms, total: 540 ms Wall time: 10.9 s
<xarray.Dataset> Size: 1GB Dimensions: (cycle_num: 1, pass_num: 10, num_lines: 9866, num_pixels: 69, num_sides: 2) Coordinates: longitude (pass_num, num_lines, num_pixels) int32 27MB ManifestArray<shape=(10, 9866, 69), dtype=int32, chunks=(1,... latitude (pass_num, num_lines, num_pixels) int32 27MB ManifestArray<shape=(10, 9866, 69), dtype=int32, chunks=(1,... latitude_nadir (pass_num, num_lines) int32 395kB ... longitude_nadir (pass_num, num_lines) int32 395kB ... * cycle_num (cycle_num) int64 8B 1 * pass_num (pass_num) int64 80B 149 150 ... 158 Dimensions without coordinates: num_lines, num_pixels, num_sides Data variables: (12/98) height_cor_xover_qual (cycle_num, pass_num, num_lines, num_pixels) uint8 7MB ManifestArray<shape=(1, 10, 9866, 69), dtype=uint8, chu... swh_ssb_cor_source (cycle_num, pass_num, num_lines, num_pixels) uint8 7MB ManifestArray<shape=(1, 10, 9866, 69), dtype=uint8, chu... rain_rate (cycle_num, pass_num, num_lines, num_pixels) uint8 7MB ManifestArray<shape=(1, 10, 9866, 69), dtype=uint8, chu... dynamic_ice_flag (cycle_num, pass_num, num_lines, num_pixels) uint8 7MB ManifestArray<shape=(1, 10, 9866, 69), dtype=uint8, chu... orbit_qual (cycle_num, pass_num, num_lines) uint8 99kB ManifestArray<shape=(1, 10, 9866), dtype=uint8, chunks=(1,... ancillary_surface_classification_flag (cycle_num, pass_num, num_lines, num_pixels) uint8 7MB ManifestArray<shape=(1, 10, 9866, 69), dtype=uint8, chu... ... ... sig0_cor_atmos_model (cycle_num, pass_num, num_lines, num_pixels) float32 27MB ManifestArray<shape=(1, 10, 9866, 69), dtype=float32, ... cross_track_distance (cycle_num, pass_num, num_lines, num_pixels) float32 27MB ManifestArray<shape=(1, 10, 9866, 69), dtype=float32, ... sig0_karin_uncert (cycle_num, pass_num, num_lines, num_pixels) float32 27MB ManifestArray<shape=(1, 10, 9866, 69), dtype=float32, ... time (cycle_num, pass_num, num_lines) float64 789kB ManifestArray<shape=(1, 10, 9866), dtype=float64, chunks=... 
time_tai (cycle_num, pass_num, num_lines) float64 789kB ManifestArray<shape=(1, 10, 9866), dtype=float64, chunks=... polarization_karin (cycle_num, pass_num, num_lines, num_sides) object 2MB ManifestArray<shape=(1, 10, 9866, 2), dtype=object, ch... Attributes: (12/29) Conventions: CF-1.7 title: Level 2 Low Rate Sea Surface Height Dat... institution: JPL source: Ka-band radar interferometer platform: SWOT reference_document: D-56407_SWOT_Product_Description_L2_LR_SSH ... ... xref_pole_location_file: SMM_PO1_AXXCNE20231122_020000_19900101_... xref_wave_model_files: SMM_WMA_AXPCNE20230727_072014_20230726_... xref_geco_database_version: v102 ellipsoid_semi_major_axis: 6378137.0 ellipsoid_flattening: 0.0033528106647474805 references: V1.2.1