import requests
from lxml.html import parse
from io import StringIO

import xarray as xr
from datetime import datetime
import rioxarray # used magically via xr
import dask
library(rvest)
library(stringr)
library(stars)

We extract a list of URLs to the netCDF files:

url = "https://noaadata.apps.nsidc.org/NOAA/G02202_V4/north/daily/1978/"
r = requests.get(url) 
tree = parse(StringIO(r.text)).getroot()
files = [a.text for a in tree.iter('a')]
sic_files = [i for i in files if 'conc' in i]
sic_urls = [url + s for s in sic_files]
url = "https://noaadata.apps.nsidc.org/NOAA/G02202_V4/north/daily/1978/"
page <- read_html(url)
files <- page %>% html_nodes("a") %>% html_attr("href")
urls <- paste0(url, files)
sic_urls <- urls |> subset(str_detect(urls, "conc"))
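A quick sanity check on the Python side (a sketch; the exact count depends on what the directory listing contains) is to look at how many files were found and what the first URL looks like:

print(len(sic_urls))
print(sic_urls[0])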

We open all of these netCDF files lazily over the virtual filesystem, which lets us efficiently subset just the data we need.

# lazily open every file and stack them along a new "time" dimension
cdr = xr.open_mfdataset(sic_urls,
                        variable="cdr_seaice_conc",
                        engine="rasterio",
                        concat_dim="time",
                        combine="nested")
# pull the YYYYMMDD date stamp out of each filename
dates <- stringr::str_extract(sic_urls, "\\d{8}") |> lubridate::ymd()

# read the concentration subdataset from each file, stacking along those dates
cdr <- read_stars(paste0("/vsicurl/", sic_urls),
                  "cdr_seaice_conc",
                  along = list(time = dates),
                  quiet = TRUE)
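On the Python side the result is dask-backed, so subsetting stays lazy until we ask for values; a brief sketch (the variable name assumes the layout produced by open_mfdataset above):

# select a handful of days and compute a summary; only those files are actually read
first_week = cdr["cdr_seaice_conc"].isel(time=slice(0, 7))
print(first_week.mean().compute())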