Rich Signell's Notebook Blog

Blogging with IPython Notebooks

CSW Access to NGDC OPeNDAP Endpoints

Test to see if we can SEARCH, ACCESS and USE data using IOOS supported service protocols.

  1. SEARCH a CSW Endpoint to find all datasets that: are containing within a specific bounding box have a specified variable of interest fall withing a specified date range contain a specific data access service type

  2. ACCESS data from the discovered data endpoints

  3. USE the data and see if we can use the data (to the extent of making a time series plot with a labeled x-axis).

In [1]:
from pylab import *
from owslib.csw import CatalogueServiceWeb
from owslib import fes
import random
import netCDF4
import pandas as pd
import datetime as dt

These are tags NGDC would like tested:

apiso:ResponsiblePartyName
apiso:Keywords
apiso:KeywordType
apiso:VertExtent_min
apiso:VertExtent_max
apiso:ServiceType
apiso:ServiceTypeVersion
apiso:OperatesOn
resource.url
website.url
In [2]:
#from IPython.core.display import HTML
#HTML('<iframe src=http://www.ngdc.noaa.gov/geoportal/ width=950 height=400></iframe>')
In [3]:
# connect to CSW, explore it's properties

endpoint = 'http://www.ngdc.noaa.gov/geoportal/csw' # NGDC Geoportal

#endpoint = 'http://www.nodc.noaa.gov/geoportal/csw'   # NODC Geoportal: granule level
#endpoint = 'http://data.nodc.noaa.gov/geoportal/csw'  # NODC Geoportal: collection level   
#endpoint = 'http://geodiscover.cgdi.ca/wes/serviceManagerCSW/csw'  # NRCAN CUSTOM
#endpoint = 'http://geoport.whoi.edu/gi-cat/services/cswiso' # USGS Woods Hole GI_CAT
#endpoint = 'http://cida.usgs.gov/gdp/geonetwork/srv/en/csw' # USGS CIDA Geonetwork
#endpoint = 'http://cmgds.marine.usgs.gov/geonetwork/srv/en/csw' # USGS Coastal and Marine Program
#endpoint = 'http://geoport.whoi.edu/geoportal/csw' # USGS Woods Hole Geoportal 
#endpoint = 'http://geo.gov.ckan.org/csw'  # CKAN testing site for new Data.gov
#endpoint = 'https://edg.epa.gov/metadata/csw'  # EPA
#endpoint = 'http://cwic.csiss.gmu.edu/cwicv1/discovery'  # CWIC

csw = CatalogueServiceWeb(endpoint,timeout=60)
csw.version
Out[3]:
'2.0.2'
In [4]:
[op.name for op in csw.operations]
Out[4]:
['GetCapabilities',
 'DescribeRecord',
 'GetRecords',
 'GetRecordById',
 'Transaction']
In [5]:
# hopefully something like this will be implemented in fes soon
def dateRange(start_date='1900-01-01',stop_date='2100-01-01',constraint='overlaps'):
    if constraint == 'overlaps':
        start = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:TempExtent_begin', literal=stop_date)
        stop = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:TempExtent_end', literal=start_date)
    elif constraint == 'within':
        start = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:TempExtent_begin', literal=start_date)
        stop = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:TempExtent_end', literal=stop_date)
    return start,stop
In [6]:
# hopefully something like this will be implemented in fes soon
def zRange(min_z='-5000', max_z='0', constraint='overlaps'):
    if constraint == 'overlaps':
        zmin = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:VertExtent_min', literal=min_z)
        zmax = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:VertExtent_max', literal=max_z)
    elif constraint == 'within':
        zmin = fes.PropertyIsGreaterThanOrEqualTo(propertyname='apiso:VertExtent_min', literal=min_z)
        zmax = fes.PropertyIsLessThanOrEqualTo(propertyname='apiso:VertExtent_max', literal=max_z)
    return zmin,zmax
In [7]:
# User Input for query

#box=[-120, 2.0, -110.0, 6.0] #  oceansites
box=[-160, 19, -156, 23]   # pacioos
start_date='2012-05-01'
stop_date='2012-06-01'
min_z = '-5000'
max_z = '0'
responsible_party = 'Margaret McManus'
std_name = 'sea_water_temperature'
service_type='opendap'
In [8]:
# convert User Input into FES filters
start,stop = dateRange(start_date,stop_date)
zmin,zmax = zRange(min_z,max_z,constraint='within')
bbox = fes.BBox(box)
#keywords = fes.PropertyIsLike(propertyname='anyText', literal=std_name)
#keywords = fes.PropertyIsLike(propertyname='apiso:Keywords', literal=std_name)
keywords = fes.PropertyIsEqualTo(propertyname='apiso:Keywords', literal=std_name)
serviceType = fes.PropertyIsLike(propertyname='apiso:ServiceType', literal=('*%s*' % service_type))
ResponsiblePartyName = fes.PropertyIsEqualTo(propertyname='apiso:ResponsiblePartyName', literal=responsible_party)
#serviceType = fes.PropertyIsEqualTo(propertyname='apiso:ServiceType', literal=service_type)
In [9]:
# try simple request using only keywords

csw.getrecords2(constraints=[keywords],maxrecords=5)
csw.records.keys()
Out[9]:
['PWS_L1_FCST',
 'Central and Northern California Ocean Observing System SOS',
 'MB_FCST',
 'PWS_L0_FCST',
 'SCB_DAS']
In [10]:
# try request using multiple filters "and" syntax: [[filter1,filter2]]
csw.getrecords2(constraints=[[keywords,start,stop,serviceType,bbox]],maxrecords=5,esn='full')
csw.records.keys()
Out[10]:
['NS10agg', 'NS01agg', 'NS02agg', 'NS03agg', 'NS04agg']
In [11]:
# try request using multiple filters "and" syntax: [[filter1,filter2]]
csw.getrecords2(constraints=[[keywords,start,stop,bbox]],maxrecords=5,esn='full')
csw.records.keys()
Out[11]:
['National Data Buoy Center SOS',
 'NOAA.NOS.CO-OPS SOS',
 'NS01agg',
 'NS02agg',
 'NS03agg']

In the next cell we take a look at the references for a random record. The ServiceType for the OPeNDAP Data URL below is ServiceType:odp:url, which seems correct!

In [12]:
choice=random.choice(list(csw.records.keys()))
print choice
csw.records[choice].references
National Data Buoy Center SOS

Out[12]:
[{'scheme': 'urn:x-esri:specification:ServiceType:distribution:url',
  'url': 'http://sdf.ndbc.noaa.gov/sos/server.php?service=SOS&request=GetCapabilities'},
 {'scheme': 'urn:x-esri:specification:ServiceType:distribution:url',
  'url': 'http://sdf.ndbc.noaa.gov/sos/server.php?service=SOS&request=GetCapabilities'},
 {'scheme': 'urn:x-esri:specification:ServiceType:sos:url',
  'url': 'http://sdf.ndbc.noaa.gov/sos/server.php?service=SOS&request=GetCapabilities'},
 {'scheme': 'urn:x-esri:specification:ServiceType:sos:url',
  'url': 'http://sdf.ndbc.noaa.gov/sos/server.php?service=SOS&request=GetCapabilities'}]

In the next cell we try using the ResponsiblePartyName, we are looking for Margaret McMannus, who we see in the ISO record for NS10agg here
http://oos.soest.hawaii.edu/thredds/iso/hioos/nss/ns10agg?catalog=http%3A%2F%2Foos.soest.hawaii.edu%2Fthredds%2Fidd%2Fnss_hioos.html&dataset=NS10agg

In [13]:
# try adding responsible party role 
csw.getrecords2(constraints=[ResponsiblePartyName],maxrecords=10,esn='full')
csw.records.keys()
Out[13]:
['NS05agg',
 'NS10agg',
 'NS09agg',
 'NS15agg',
 'NS04agg',
 'NS06agg',
 'NS03agg',
 'NS11agg',
 'NS07agg',
 'NS08agg']

Now print out some titles

In [14]:
for rec,item in csw.records.iteritems():
    print item.title
PacIOOS Nearshore Sensor 05: Pago Pago, American Samoa
PacIOOS Nearshore Sensor 10: Maunalua Bay, Oahu, Hawaii
PacIOOS Nearshore Sensor 09: Cetti Bay, Guam
PacIOOS Nearshore Sensor 15: Pago Bay, Guam
PacIOOS Nearshore Sensor 04: Waikiki Aquarium, Oahu, Hawaii
PacIOOS Nearshore Sensor 06: Pohnpei, Micronesia
PacIOOS Nearshore Sensor 03: Hilton Hawaiian Pier, Oahu, Hawaii
PacIOOS Nearshore Sensor 11: Saipan, CNMI
PacIOOS Nearshore Sensor 07: Majuro, Marshall Islands
PacIOOS Nearshore Sensor 08: Koror, Palau

In the next cell we try using a vertical extent range, which doesn't seem to be working, as we are looking for z values between [-5000, 0] and we see in the ISO record for 'NS10agg' here:
http://oos.soest.hawaii.edu/thredds/iso/hioos/nss/ns10agg?catalog=http%3A%2F%2Foos.soest.hawaii.edu%2Fthredds%2Fidd%2Fnss_hioos.html&dataset=NS10agg
that the vertical extent is from [-2, -2]

In [15]:
csw.getrecords2(constraints=[[keywords,start,stop,serviceType,bbox,zmin,zmax]],maxrecords=5,esn='full')
csw.records.keys()
Out[15]:
['NS10agg', 'NS01agg', 'NS02agg', 'NS03agg', 'NS04agg']

Define a function that will return the endpoint for a specified service type

In [16]:
# get specific ServiceType URL from records
def service_urls(records,service_string='urn:x-esri:specification:ServiceType:odp:url'):
    urls=[]
    for key,rec in records.iteritems():
        #create a generator object, and iterate through it until the match is found
        #if not found, gets the default value (here "none")
        url = next((d['url'] for d in rec.references if d['scheme'] == service_string), None)
        if url is not None:
            urls.append(url)
    return urls

Print out all the OPeNDAP Data URL endpoints

In [17]:
dap_urls = service_urls(csw.records,service_string='urn:x-esri:specification:ServiceType:odp:url')
print "\n".join(dap_urls)
http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns10agg
http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns01agg
http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns02agg
http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns03agg
http://oos.soest.hawaii.edu/thredds/dodsC/hioos/nss/ns04agg

In [18]:
from collections import defaultdict

def standard_names(ncv):
    '''
    get dictionary of variables with standard_names
    '''
    d = defaultdict(list)
    for k,v in ncv.iteritems():
        if 'standard_name' in v.ncattrs():
            d[v.standard_name].append(k)
    return d
In [19]:
for url in dap_urls:
    ncv = netCDF4.Dataset(url).variables
    lat = ncv['lon'][:]
    lon = ncv['lat'][:]
    tvar = ncv['time']
    istart = netCDF4.date2index(dt.datetime.strptime(start_date,'%Y-%m-%d'),tvar,select='nearest')
    istop = netCDF4.date2index(dt.datetime.strptime(stop_date,'%Y-%m-%d'),tvar,select='nearest')
    if istart != istop:
        dtime = netCDF4.num2date(tvar[istart:istop],tvar.units)
        # make a dictionary containing all data from variables that matched the standard_name
        # find list of variables for each standard_name
        d = standard_names(ncv)
        # find all the variables matching standard_name=std_name
        print d[std_name]
        # read all the data into a dictionary
        data_dict={}
        lev=0
        for v in d[std_name]:
            data_dict[v]=ncv[v][istart:istop].flatten()
        # Create Pandas data frame, with time index
        ts = pd.DataFrame.from_dict(data_dict)
        ts.index=dtime
        ts.plot(figsize=(12,4));
        title(std_name)
[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'turb', u'flor', u'salt', u'pres']
[u'temp']
[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'turb', u'flor', u'salt', u'pres']
[u'temp']
[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'turb', u'flor', u'salt']
[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'salt', u'pres']
[u'temp']
[u'z', u'lat', u'lon', u'time', u'temp', u'cond', u'salt', u'pres']
[u'temp']

In []:
HTML(html)

Comments