Data Download

Contents

Data Download#

Copy datasets from OOINET to localhost.#

The data order is manual. Resulting links are placed in ~/argosy/download_link_list.txt, one URL per line, for example:

https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235914738Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample

Here is what the three cells in this notebook do:

First cell: From links in ~/argosy/download_link_list.txt download source data files
- Files are saved to the corresponding localhost folders `~/ooi/ooinet/rca/SlopeBase/{scalar | vector}/<year>_<instrument>/`
Second cell: Scan for redundant source data files, step one
- Creates a bash script in ~/argosy that deletes superfluous files
Third cell: Scan remaining files for “minimum cover” files
- This is the minimum number of files that cover the entirety of available time
- Creates a bash script again that deletes superfluous files

#!/usr/bin/env python
# coding: utf-8

# # Data Download
#
#
# This notebook copies datasets from OOINET to localhost.
#
#
# The data order is manual.
# The resulting links are placed in the text file `~/argosy/download_link_list.txt`: One URL per line:
#
#
# ```
# https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260207T235914738Z-RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample
# ```
#
# We want the code in the cell below to read this file and go to each link in succession,
# downloading the data to a corresponding localhost folder.

# In[8]:


import requests
from pathlib import Path
from bs4 import BeautifulSoup
import re

def parse_year_from_filename(filename):
    """Return the start year encoded in a NetCDF file name, or None.

    The name must end with ``_<start>-<end>.nc`` where both stamps look like
    ``YYYYMMDDTHHMMSS.ffffff``; the year is taken from the start stamp.
    """
    found = re.search(
        r'_(\d{4})\d{2}\d{2}T\d{6}\.\d+-\d{8}T\d{6}\.\d+\.nc$',
        filename,
    )
    return int(found.group(1)) if found else None

def is_file_complete(filepath):
    """Return True when ``filepath`` exists and holds at least one byte."""
    return filepath.exists() and filepath.stat().st_size > 0

def download_file(url, destination):
    """Download ``url`` to ``destination`` through a temporary sibling file.

    The body is streamed into ``destination`` with its suffix replaced by
    ``.tmp`` and only moved into place once the transfer has finished, so a
    partially written file never appears under the final name.

    Args:
        url: Address of the file to fetch.
        destination: ``pathlib.Path`` the finished file should end up at.

    Returns:
        True on success; False if any ``Exception`` occurred (the error is
        printed). ``KeyboardInterrupt`` is not swallowed, but the partial
        temporary file is still removed before it propagates.
    """
    temp_file = destination.with_suffix('.tmp')
    try:
        # The context manager releases the connection even when streaming
        # fails part-way through.
        with requests.get(url, stream=True, timeout=300) as response:
            response.raise_for_status()

            with open(temp_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # replace() overwrites an existing (e.g. zero-byte) destination on
        # every platform; rename() fails on Windows in that situation.
        temp_file.replace(destination)
        return True
    except Exception as e:
        print(f"    Error: {e}")
        return False
    finally:
        # Runs for KeyboardInterrupt as well, so an interrupted transfer does
        # not leave a stray .tmp file. After a successful replace() the
        # temporary file no longer exists and this is a no-op.
        temp_file.unlink(missing_ok=True)

def bulk_download(instrument="ctd", ooi_instrument="CTDPF"):
    """Bulk download instrument files from URL list with restart tolerance.

    Reads one directory-listing URL per line from
    ``~/argosy/download_link_list.txt``, scrapes each listing for ``.nc``
    links whose file name contains ``ooi_instrument``, and saves each file to
    ``~/ooi/ooinet/rca/SlopeBase/scalar/{year}_{instrument}/``, where ``year``
    is the start year parsed from the file name. Files that already exist with
    a non-zero size are skipped, so an interrupted run can be started again.

    Args:
        instrument: Short key used in the local folder names (e.g. "ctd").
        ooi_instrument: OOI instrument code that must appear in a file name
            for that file to be downloaded (e.g. "CTDPF").

    Returns:
        None. Progress and a final summary are printed.
    """
    # Function-scope import keeps this notebook cell self-contained.
    from urllib.parse import urljoin

    print(f"Running bulk_download for instrument type = {instrument} (OOI: {ooi_instrument})")

    # Read URL list
    url_list_file = Path("~/argosy/download_link_list.txt").expanduser()
    if not url_list_file.exists():
        print(f"File not found: {url_list_file}")
        return

    with open(url_list_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    print(f"Found {len(urls)} URLs to process\n")

    # Base folder for all instrument/year source data
    base_folder = Path("~/ooi/ooinet/rca/SlopeBase/scalar").expanduser()

    if not base_folder.exists():
        print(f"Base folder does not exist: {base_folder}")
        return

    # Create the usual year folders up front; folders for any other year that
    # turns up in a file name are created on demand further down.
    years_seen = set(range(2014, 2027))
    for year in sorted(years_seen):
        (base_folder / f"{year}_{instrument}").mkdir(exist_ok=True)

    total_downloaded = 0
    total_skipped    = 0
    total_complete   = 0

    for url_idx, url in enumerate(urls, 1):
        print(f"=== URL {url_idx}/{len(urls)} ===")
        print(f"{url}")

        # Best-effort per URL: a failure on one listing must not stop the rest.
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            # Trailing slash so that urljoin() treats the URL as a directory.
            listing_url = url.rstrip('/') + '/'

            # (file name, absolute download URL) for every matching link.
            # Only the last path component of the href is used locally, so an
            # absolute or nested href cannot redirect the file outside its
            # year folder. A name ending in '.nc' can never end in '.ncml',
            # so no separate '.ncml' test is needed.
            nc_files = []
            for link in soup.find_all('a'):
                href = link.get('href', '')
                if not href.endswith('.nc'):
                    continue
                filename = href.rsplit('/', 1)[-1]
                if ooi_instrument in filename:
                    nc_files.append((filename, urljoin(listing_url, href)))

            already_downloaded = 0
            to_download = []

            for filename, file_url in nc_files:
                year = parse_year_from_filename(filename)
                if year is None:
                    # Name does not carry a recognisable time range; skip it.
                    continue
                dest_file = base_folder / f"{year}_{instrument}" / filename
                if is_file_complete(dest_file):
                    already_downloaded += 1
                else:
                    to_download.append((filename, year, file_url, dest_file))

            print(f"  Total .nc files:       {len(nc_files)}")
            print(f"  Already downloaded:    {already_downloaded}")
            print(f"  Remaining to download: {len(to_download)}")

            total_complete += already_downloaded

            if not to_download:
                print("  All files complete, skipping\n")
                continue

            for file_idx, (filename, year, file_url, dest_file) in enumerate(to_download, 1):
                print(f"  [{file_idx}/{len(to_download)}] {filename} -> {year}_{instrument}/")

                # Covers years outside the pre-created range.
                dest_file.parent.mkdir(exist_ok=True)
                years_seen.add(year)

                if download_file(file_url, dest_file):
                    total_downloaded += 1
                    print(f"    Complete: {dest_file.stat().st_size / (1024*1024):.1f} MB")
                else:
                    total_skipped += 1

            print()

        except Exception as e:
            print(f"  Error processing URL: {e}\n")

    print("=== Download Summary ===")
    print(f"Files already complete:  {total_complete}")
    print(f"Files newly downloaded:  {total_downloaded}")
    print(f"Files failed/skipped:    {total_skipped}")

    print("\nTotal files by year:")
    for year in sorted(years_seen):
        year_folder = base_folder / f"{year}_{instrument}"
        if year_folder.exists():
            count = len(list(year_folder.glob("*.nc")))
            if count > 0:
                print(f"  {year}: {count} files")


# Instrument key -> OOI instrument code mapping (from Sensor Table):
#   ctd   -> CTDPF
#   flor  -> FLORT
#   ph    -> PHSEN
#   pco2  -> PCO2W
#   nitr  -> NUTNR
#   par   -> PARAD
#   vel   -> VELPT
#   irr   -> SPKIR
#   oa/ba -> OPTAA

# Run the bulk download - uncomment as needed.
# Every call reads the same ~/argosy/download_link_list.txt; the second
# argument selects which files in each listing are kept (its text must appear
# in the file name) and the first names the local "<year>_<key>" folders.
bulk_download("ctd",  "CTDPF")
# bulk_download("flor", "FLORT")
# bulk_download("ph",   "PHSEN")
# bulk_download("pco2", "PCO2W")
# bulk_download("nitr", "NUTNR")
# bulk_download("par",  "PARAD")
# bulk_download("vel",  "VELPT")
# bulk_download("irr",  "SPKIR")

Running bulk_download for instrument type = ctd (OOI: CTDPF)
Found 3 URLs to process

=== URL 1/3 ===
https://downloads.oceanobservatories.org/async_results/kilroy1618@gmail.com/20260302T160213452Z-RS01SBPS-SF01A-2D-PHSENA101-streamed-phsen_data_record

  Total .nc files:       31
  Already downloaded:    9
  Remaining to download: 22
  [1/22] deployment0001_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20141231T235959.533995-20150424T172450.573911.nc -> 2014_ctd/

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[1], line 178
    163                 print(f"  {year}: {count} files")
    166 # Instrument key -> OOI instrument code mapping (from Sensor Table):
    167 #   ctd   -> CTDPF
    168 #   flor  -> FLORT
   (...)
    176 
    177 # Run the bulk download - uncomment as needed:
--> 178 bulk_download("ctd",  "CTDPF")

Cell In[1], line 140, in bulk_download(instrument, ooi_instrument)
    137 file_url  = url.rstrip('/') + '/' + filename
    138 print(f"  [{file_idx}/{len(to_download)}] {filename} -> {year}_{instrument}/")
--> 140 if download_file(file_url, dest_file):
    141     total_downloaded += 1
    142     print(f"    Complete: {dest_file.stat().st_size / (1024*1024):.1f} MB")

Cell In[1], line 51, in download_file(url, destination)
     48 response.raise_for_status()
     50 with open(temp_file, 'wb') as f:
---> 51     for chunk in response.iter_content(chunk_size=8192):
     52         f.write(chunk)
     54 temp_file.rename(destination)

File ~/miniconda3/envs/argo-env2/lib/python3.11/site-packages/requests/models.py:816, in Response.iter_content.<locals>.generate()
    814 if hasattr(self.raw, "stream"):
    815     try:
--> 816         yield from self.raw.stream(chunk_size, decode_content=True)
    817     except ProtocolError as e:
    818         raise ChunkedEncodingError(e)

File ~/miniconda3/envs/argo-env2/lib/python3.11/site-packages/urllib3/response.py:628, in HTTPResponse.stream(self, amt, decode_content)
    626 else:
    627     while not is_fp_closed(self._fp):
--> 628         data = self.read(amt=amt, decode_content=decode_content)
    630         if data:
    631             yield data

File ~/miniconda3/envs/argo-env2/lib/python3.11/site-packages/urllib3/response.py:567, in HTTPResponse.read(self, amt, decode_content, cache_content)
    564 fp_closed = getattr(self._fp, "closed", False)
    566 with self._error_catcher():
--> 567     data = self._fp_read(amt) if not fp_closed else b""
    568     if amt is None:
    569         flush_decoder = True

File ~/miniconda3/envs/argo-env2/lib/python3.11/site-packages/urllib3/response.py:533, in HTTPResponse._fp_read(self, amt)
    530     return buffer.getvalue()
    531 else:
    532     # StringIO doesn't like amt=None
--> 533     return self._fp.read(amt) if amt is not None else self._fp.read()

File ~/miniconda3/envs/argo-env2/lib/python3.11/http/client.py:466, in HTTPResponse.read(self, amt)
    463 if self.length is not None and amt > self.length:
    464     # clip the read to the "end of response"
    465     amt = self.length
--> 466 s = self.fp.read(amt)
    467 if not s and amt:
    468     # Ideally, we would raise IncompleteRead if the content-length
    469     # wasn't satisfied, but it might break compatibility.
    470     self._close_conn()

File ~/miniconda3/envs/argo-env2/lib/python3.11/socket.py:706, in SocketIO.readinto(self, b)
    704 while True:
    705     try:
--> 706         return self._sock.recv_into(b)
    707     except timeout:
    708         self._timeout_occurred = True

File ~/miniconda3/envs/argo-env2/lib/python3.11/ssl.py:1278, in SSLSocket.recv_into(self, buffer, nbytes, flags)
   1274     if flags != 0:
   1275         raise ValueError(
   1276           "non-zero flags not allowed in calls to recv_into() on %s" %
   1277           self.__class__)
-> 1278     return self.read(nbytes, buffer)
   1279 else:
   1280     return super().recv_into(buffer, nbytes, flags)

File ~/miniconda3/envs/argo-env2/lib/python3.11/ssl.py:1134, in SSLSocket.read(self, len, buffer)
   1132 try:
   1133     if buffer is not None:
-> 1134         return self._sslobj.read(len, buffer)
   1135     else:
   1136         return self._sslobj.read(len)

KeyboardInterrupt: 

# Jupyter cell: CTD file overlap audit
# Identifies redundant CTD files and plots time coverage bars.
# This is part one: it identifies files that are entirely overlapped by another file and adds
#   them to a deletion script. After running that script there should be no more red files, just
#   orange and possibly blue. See the next cell for a further cleaning step.

import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pathlib import Path
from datetime import datetime

# Banner for this cell's printed report.
print("CTD File Overlap Audit")
print("=" * 60)

# Root of the per-year source folders; each year lives in BASE/"<year>_ctd".
BASE = Path("~/ooi/ooinet/rca/SlopeBase/scalar").expanduser()
# Years whose folders are scanned (2014 through 2026 inclusive).
YEARS = range(2014, 2027)

# --- Parse time range from filename ---
def parse_time_range(filename):
    """Return ``(start, end)`` datetimes parsed from a CTD file name.

    The name must end with ``_<start>-<end>.nc`` where each stamp looks like
    ``YYYYMMDDTHHMMSS.ffffff``; the fractional seconds are ignored. Returns
    ``(None, None)`` when the name does not match or a stamp is not a valid
    date/time.
    """
    stamp_format = "%Y%m%dT%H%M%S"
    found = re.search(r'_(\d{8}T\d{6})\.\d+-(\d{8}T\d{6})\.\d+\.nc$', filename)
    if found is None:
        return None, None
    try:
        begin, finish = (datetime.strptime(stamp, stamp_format) for stamp in found.groups())
    except ValueError:
        return None, None
    return begin, finish

# --- Collect all CTD files ---
# One record per .nc file whose name carries a parseable time range.
all_files = []
for year in YEARS:
    year_dir = BASE / f"{year}_ctd"
    if year_dir.exists():
        for nc_path in sorted(year_dir.glob("*.nc")):
            t_start, t_end = parse_time_range(nc_path.name)
            if not (t_start and t_end):
                continue
            all_files.append({
                'name': nc_path.name,
                'path': nc_path,
                'start': t_start,
                'end': t_end,
                'year': year,
            })

print(f"Total CTD files found: {len(all_files)}")

# --- Detect overlaps ---
# Order files chronologically by start time so the inner scan can stop early.
all_files.sort(key=lambda rec: rec['start'])

overlaps = []
for idx, earlier in enumerate(all_files):
    for later in all_files[idx + 1:]:
        # Starts are sorted: once one file begins at/after `earlier` ends,
        # every following file does too.
        if later['start'] >= earlier['end']:
            break
        shared_from = max(earlier['start'], later['start'])
        shared_to   = min(earlier['end'],   later['end'])
        overlaps.append({
            'file_a': earlier['name'],
            'file_b': later['name'],
            'overlap_start': shared_from,
            'overlap_end':   shared_to,
            'overlap_days':  (shared_to - shared_from).total_seconds() / 86400
        })

print(f"Overlapping file pairs: {len(overlaps)}")

if overlaps:
    print("\nOverlap details:")
    for pair in overlaps:
        span = f"{pair['overlap_start'].date()} to {pair['overlap_end'].date()}"
        print(f"\n  A: {pair['file_a']}")
        print(f"  B: {pair['file_b']}")
        print(f"  Overlap: {span} ({pair['overlap_days']:.1f} days)")

# --- Identify redundant files ---
# A file is redundant if its entire time range is covered by another file.
# Two differently named files with the *same* time range cover each other;
# only the later one (in list order) is flagged, so that one copy always
# survives the generated deletion script.
redundant = []
for i, f in enumerate(all_files):
    for j, g in enumerate(all_files):
        if i == j or f['name'] == g['name']:
            continue
        # f must be fully contained within g
        if not (g['start'] <= f['start'] and g['end'] >= f['end']):
            continue
        identical_span = g['start'] == f['start'] and g['end'] == f['end']
        if identical_span and i < j:
            # f is the first of a set of identical ranges: this g may not
            # knock it out, but a strictly larger file further on still can.
            continue
        redundant.append({'redundant': f, 'covered_by': g})
        break

print(f"\nFully redundant files (entirely covered by another): {len(redundant)}")
for r in redundant:
    print(f"  REDUNDANT: {r['redundant']['name']}")
    print(f"  COVERED BY: {r['covered_by']['name']}")

# --- Strategy recommendation ---
# Prints what to do about the detected overlaps and, when fully redundant
# files exist, writes a reviewable bash script that deletes them.
import shlex  # quoting of file paths in the generated shell script

print(f"\n{'=' * 60}")
print("STRATEGY RECOMMENDATION")
print(f"{'=' * 60}")

script_path = Path("~/argosy/delete_overlaps.sh").expanduser()

if not overlaps:
    print("No overlaps detected - CTD files appear clean.")
    if script_path.exists():
        script_path.unlink()
        print("(Removed stale delete_overlaps.sh)")
elif redundant:
    total_size = sum(r['redundant']['path'].stat().st_size for r in redundant) / (1024*1024)
    print(f"Delete {len(redundant)} fully redundant files ({total_size:.1f} MB recoverable)")
    print("For partial overlaps: Keep the file with the longer time span.")

    # Write shell script
    lines = [
        "#!/bin/bash",
        "# Auto-generated by ctd_overlap_audit.py",
        "# Deletes fully redundant CTD files (time range entirely covered by another file)",
        "",
    ]
    for r in redundant:
        lines.append(f"# Covered by: {r['covered_by']['name']}")
        # shlex.quote keeps spaces, quotes, '$' etc. in a path from being
        # interpreted by the shell; '--' ends option parsing for rm.
        lines.append(f"rm -- {shlex.quote(str(r['redundant']['path']))}")
        lines.append("")

    script_path.write_text("\n".join(lines))
    script_path.chmod(0o755)
    print(f"\nDeletion script written to: {script_path}")
    print(f"Review and run with:  bash {script_path}")
else:
    print(f"{len(overlaps)} partial overlaps found but no fully redundant files.")
    print("For each overlapping pair: Keep the file with the longer time span.")
    print("Partial overlaps may require trimming rather than deletion.")
    # Nothing to delete in this run: a script left over from an earlier run
    # no longer reflects the files on disk, so drop it as the clean case does.
    if script_path.exists():
        script_path.unlink()
        print("(Removed stale delete_overlaps.sh)")

# --- Plot: time coverage bars ---
# One horizontal bar per file (in list order), coloured by overlap status.
from matplotlib.patches import Patch

print("\nGenerating coverage plot...")

fig, ax = plt.subplots(figsize=(14, max(4, len(all_files) * 0.25)))

# (colour, alpha, legend label) for each status
bar_styles = {
    'clean':       ('steelblue', 0.6, 'Clean'),
    'overlapping': ('orange',    0.7, 'Overlapping'),
    'redundant':   ('red',       0.8, 'Redundant (fully covered)'),
}

redundant_names = {r['redundant']['name'] for r in redundant}
overlap_names = set()
for ov in overlaps:
    overlap_names.update((ov['file_a'], ov['file_b']))

for row, entry in enumerate(all_files):
    # Redundant takes precedence over merely overlapping.
    if entry['name'] in redundant_names:
        status = 'redundant'
    elif entry['name'] in overlap_names:
        status = 'overlapping'
    else:
        status = 'clean'
    bar_color, bar_alpha, _ = bar_styles[status]

    # Bar width is the file's duration in days, matching the day-based
    # numbers date2num() produces for the left edge.
    duration_days = (entry['end'] - entry['start']).total_seconds() / 86400
    ax.barh(row, duration_days,
            left=mdates.date2num(entry['start']),
            height=0.7, color=bar_color, alpha=bar_alpha, edgecolor='none')

# Format x-axis as dates, one tick per year
ax.xaxis_date()
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.xaxis.set_major_locator(mdates.YearLocator())

ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('File index (chronological)', fontsize=12)
ax.set_title(f'CTD File Time Coverage ({len(all_files)} files)\n'
             f'Blue=clean  Orange=overlapping  Red=redundant', fontsize=13)
x_min = mdates.date2num(datetime(2014, 1, 1))
x_max = mdates.date2num(datetime(2026, 12, 31))
ax.set_xlim(x_min, x_max)
ax.grid(True, axis='x', alpha=0.3)
ax.set_yticks([])

# Legend, built from the same style table as the bars
legend_elements = [
    Patch(facecolor=face, alpha=opacity, label=text)
    for face, opacity, text in (bar_styles[k] for k in ('clean', 'overlapping', 'redundant'))
]
ax.legend(handles=legend_elements, loc='upper left')

plt.tight_layout()
plot_file = Path("~/argosy/ctd_coverage.png").expanduser()
plt.savefig(plot_file, dpi=150, bbox_inches='tight')
plt.show()
print("Plot saved to ~/argosy/ctd_coverage.png")

CTD File Overlap Audit
============================================================
Total CTD files found: 40
Overlapping file pairs: 48

Overlap details:

  A: deployment0001_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20141231T235959.533995-20150518T000000.646750.nc
  B: deployment0001_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20150424T172451.574121-20150705T235959.089334.nc
  Overlap: 2015-04-24 to 2015-05-18 (23.3 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20150708T220539.700577-20151130T000000.854900.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151107T011757.231076-20160203T075108.287602.nc
  Overlap: 2015-11-07 to 2015-11-30 (22.9 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20150708T220539.700577-20151130T000000.854900.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151112T235959.387746-20160301T180000.653717.nc
  Overlap: 2015-11-12 to 2015-11-30 (17.0 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20150708T220539.700577-20151130T000000.854900.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151129T235959.854788-20160324T050505.085704.nc
  Overlap: 2015-11-29 to 2015-11-30 (0.0 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151107T011757.231076-20160203T075108.287602.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151112T235959.387746-20160301T180000.653717.nc
  Overlap: 2015-11-12 to 2016-02-03 (82.3 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151107T011757.231076-20160203T075108.287602.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151129T235959.854788-20160324T050505.085704.nc
  Overlap: 2015-11-29 to 2016-02-03 (65.3 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151112T235959.387746-20160301T180000.653717.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151129T235959.854788-20160324T050505.085704.nc
  Overlap: 2015-11-29 to 2016-03-01 (92.8 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151112T235959.387746-20160301T180000.653717.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160203T075109.288025-20160516T000000.559495.nc
  Overlap: 2016-02-03 to 2016-03-01 (27.4 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151112T235959.387746-20160301T180000.653717.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160301T175959.653501-20160622T120000.737566.nc
  Overlap: 2016-03-01 to 2016-03-01 (0.0 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151129T235959.854788-20160324T050505.085704.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160203T075109.288025-20160516T000000.559495.nc
  Overlap: 2016-02-03 to 2016-03-24 (49.9 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20151129T235959.854788-20160324T050505.085704.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160301T175959.653501-20160622T120000.737566.nc
  Overlap: 2016-03-01 to 2016-03-24 (22.5 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160203T075109.288025-20160516T000000.559495.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160301T175959.653501-20160622T120000.737566.nc
  Overlap: 2016-03-01 to 2016-05-16 (75.3 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160203T075109.288025-20160516T000000.559495.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160324T050506.085710-20160711T000000.025662.nc
  Overlap: 2016-03-24 to 2016-05-16 (52.8 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160203T075109.288025-20160516T000000.559495.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160515T235959.558758-20160716T111049.607585.nc
  Overlap: 2016-05-15 to 2016-05-16 (0.0 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160301T175959.653501-20160622T120000.737566.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160324T050506.085710-20160711T000000.025662.nc
  Overlap: 2016-03-24 to 2016-06-22 (90.3 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160301T175959.653501-20160622T120000.737566.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160515T235959.558758-20160716T111049.607585.nc
  Overlap: 2016-05-15 to 2016-06-22 (37.5 days)

  A: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160324T050506.085710-20160711T000000.025662.nc
  B: deployment0002_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20160515T235959.558758-20160716T111049.607585.nc
  Overlap: 2016-05-15 to 2016-07-11 (56.0 days)

  A: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20180710T170527.140820-20181217T000000.485697.nc
  B: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20181125T055959.112187-20190311T180000.560954.nc
  Overlap: 2018-11-25 to 2018-12-17 (21.8 days)

  A: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20180710T170527.140820-20181217T000000.485697.nc
  B: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20181125T235959.746938-20190318T000000.207331.nc
  Overlap: 2018-11-25 to 2018-12-17 (21.0 days)

  A: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20181125T055959.112187-20190311T180000.560954.nc
  B: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20181125T235959.746938-20190318T000000.207331.nc
  Overlap: 2018-11-25 to 2019-03-11 (105.8 days)

  A: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20181125T055959.112187-20190311T180000.560954.nc
  B: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190311T175959.560630-20190614T051242.692351.nc
  Overlap: 2019-03-11 to 2019-03-11 (0.0 days)

  A: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20181125T235959.746938-20190318T000000.207331.nc
  B: deployment0006_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20190311T175959.560630-20190614T051242.692351.nc
  Overlap: 2019-03-11 to 2019-03-18 (6.3 days)

  A: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20200803T164413.888856-20201130T225829.341226.nc
  B: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201122T235959.198893-20210315T000000.016139.nc
  Overlap: 2020-11-22 to 2020-11-30 (8.0 days)

  A: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20200803T164413.888856-20201130T225829.341226.nc
  B: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201127T235959.147644-20210319T120000.431316.nc
  Overlap: 2020-11-27 to 2020-11-30 (3.0 days)

  A: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201122T235959.198893-20210315T000000.016139.nc
  B: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201127T235959.147644-20210319T120000.431316.nc
  Overlap: 2020-11-27 to 2021-03-15 (107.0 days)

  A: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201122T235959.198893-20210315T000000.016139.nc
  B: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20210314T235959.016346-20210705T000000.595883.nc
  Overlap: 2021-03-14 to 2021-03-15 (0.0 days)

  A: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201127T235959.147644-20210319T120000.431316.nc
  B: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20210314T235959.016346-20210705T000000.595883.nc
  Overlap: 2021-03-14 to 2021-03-19 (4.5 days)

  A: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20201127T235959.147644-20210319T120000.431316.nc
  B: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20210317T085642.860957-20210804T190841.180708.nc
  Overlap: 2021-03-17 to 2021-03-19 (2.1 days)

  A: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20210314T235959.016346-20210705T000000.595883.nc
  B: deployment0008_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20210317T085642.860957-20210804T190841.180708.nc
  Overlap: 2021-03-17 to 2021-07-05 (109.6 days)

  A: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20210824T233256.569856-20211223T063248.370115.nc
  B: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20211217T175959.876304-20220405T180000.708918.nc
  Overlap: 2021-12-17 to 2021-12-23 (5.5 days)

  A: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20210824T233256.569856-20211223T063248.370115.nc
  B: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20211219T235959.593037-20220411T000000.661721.nc
  Overlap: 2021-12-19 to 2021-12-23 (3.3 days)

  A: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20211217T175959.876304-20220405T180000.708918.nc
  B: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20211219T235959.593037-20220411T000000.661721.nc
  Overlap: 2021-12-19 to 2022-04-05 (106.8 days)

  A: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20211217T175959.876304-20220405T180000.708918.nc
  B: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20211223T063249.370230-20220902T204459.535113.nc
  Overlap: 2021-12-23 to 2022-04-05 (103.5 days)

  A: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20211219T235959.593037-20220411T000000.661721.nc
  B: deployment0009_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20211223T063249.370230-20220902T204459.535113.nc
  Overlap: 2021-12-23 to 2022-04-11 (108.7 days)

  A: deployment0010_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20220915T181349.594332-20230116T000000.532180.nc
  B: deployment0010_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20230101T000000.149650-20230120T115959.534201.nc
  Overlap: 2023-01-01 to 2023-01-16 (15.0 days)

  A: deployment0010_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20220915T181349.594332-20230116T000000.532180.nc
  B: deployment0010_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20230105T055959.511165-20230224T084327.668331.nc
  Overlap: 2023-01-05 to 2023-01-16 (10.8 days)

  A: deployment0010_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20230101T000000.149650-20230120T115959.534201.nc
  B: deployment0010_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20230105T055959.511165-20230224T084327.668331.nc
  Overlap: 2023-01-05 to 2023-01-20 (15.2 days)

  A: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20230817T212929.521060-20240212T000000.874083.nc
  B: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20231228T055959.503935-20240417T120000.203436.nc
  Overlap: 2023-12-28 to 2024-02-12 (45.8 days)

  A: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20230817T212929.521060-20240212T000000.874083.nc
  B: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240125T041622.072379-20240506T000000.030980.nc
  Overlap: 2024-01-25 to 2024-02-12 (17.8 days)

  A: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20230817T212929.521060-20240212T000000.874083.nc
  B: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240211T235959.873971-20240702T063319.865896.nc
  Overlap: 2024-02-11 to 2024-02-12 (0.0 days)

  A: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20231228T055959.503935-20240417T120000.203436.nc
  B: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240125T041622.072379-20240506T000000.030980.nc
  Overlap: 2024-01-25 to 2024-04-17 (83.3 days)

  A: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20231228T055959.503935-20240417T120000.203436.nc
  B: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240211T235959.873971-20240702T063319.865896.nc
  Overlap: 2024-02-11 to 2024-04-17 (65.5 days)

  A: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20231228T055959.503935-20240417T120000.203436.nc
  B: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240417T115959.202701-20240809T063045.498432.nc
  Overlap: 2024-04-17 to 2024-04-17 (0.0 days)

  A: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240125T041622.072379-20240506T000000.030980.nc
  B: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240211T235959.873971-20240702T063319.865896.nc
  Overlap: 2024-02-11 to 2024-05-06 (84.0 days)

  A: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240125T041622.072379-20240506T000000.030980.nc
  B: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240417T115959.202701-20240809T063045.498432.nc
  Overlap: 2024-04-17 to 2024-05-06 (18.5 days)

  A: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240211T235959.873971-20240702T063319.865896.nc
  B: deployment0011_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240417T115959.202701-20240809T063045.498432.nc
  Overlap: 2024-04-17 to 2024-07-02 (75.8 days)

  A: deployment0012_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20240809T203132.739326-20241216T000000.135762.nc
  B: deployment0012_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20241122T072830.648546-20250805T142747.771470.nc
  Overlap: 2024-11-22 to 2024-12-16 (23.7 days)

  A: deployment0014_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20250814T232320.067702-20251215T000000.085007.nc
  B: deployment0014_RS01SBPS-SF01A-2A-CTDPFA102-streamed-ctdpf_sbe43_sample_20251123T101006.524654-20260101T000000.312004.nc
  Overlap: 2025-11-23 to 2025-12-15 (21.6 days)

Fully redundant files (entirely covered by another): 0

============================================================
STRATEGY RECOMMENDATION
============================================================
48 partial overlaps found but no fully redundant files.
For each overlapping pair: Keep the file with the longer time span.
Partial overlaps may require trimming rather than deletion.

Generating coverage plot...

../_images/8d18d51f525b357c95fa38586e35b6196a87fc4f3447884f59409c2903c5811e.png

Plot saved to ~/argosy/ctd_coverage.png

# Jupyter cell: CTD minimum cover analysis
# Finds the minimal set of files that preserves full time coverage,
# then generates a deletion script for all others.

import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from pathlib import Path
from datetime import datetime

# Banner for this cell's text output (title line plus a 60-character rule).
print("CTD Minimum Cover Analysis", "=" * 60, sep="\n")

# Root of the locally mirrored scalar data; "~" is expanded to the user's home.
BASE = Path("~/ooi/ooinet/rca/SlopeBase/scalar").expanduser()
# Calendar years to scan: 2014 through 2026 inclusive.
YEARS = range(2014, 2027)

def parse_time_range(filename):
    """
    Pull the (start, end) timestamps out of a NetCDF filename.

    The name must end in ``_<YYYYMMDD>T<HHMMSS>.<frac>-<YYYYMMDD>T<HHMMSS>.<frac>.nc``.
    The fractional-second digits are matched but discarded, so both returned
    datetimes have whole-second resolution.

    Returns a ``(start, end)`` tuple of naive ``datetime`` objects, or
    ``(None, None)`` when the name does not match or either stamp is not a
    valid calendar date/time.
    """
    found = re.search(r'_(\d{8}T\d{6})\.\d+-(\d{8}T\d{6})\.\d+\.nc$', filename)
    if found is None:
        return None, None

    first_stamp, last_stamp = found.groups()
    try:
        begin = datetime.strptime(first_stamp, "%Y%m%dT%H%M%S")
        finish = datetime.strptime(last_stamp, "%Y%m%dT%H%M%S")
    except ValueError:
        # Digits matched the pattern but do not form a real date (e.g. month 13).
        return None, None
    return begin, finish

# --- Collect all CTD files ---
# Walk each "<year>_ctd" folder under BASE and record every .nc file whose
# name yields a parseable time range; other files are skipped silently.
all_files = []
for year in YEARS:
    year_dir = BASE / f"{year}_ctd"
    if not year_dir.exists():
        continue
    for nc_path in sorted(year_dir.glob("*.nc")):
        start, end = parse_time_range(nc_path.name)
        if not (start and end):
            continue
        all_files.append({
            'name': nc_path.name,
            'path': nc_path,
            'start': start,
            'end': end,
        })

# Chronological order by start time (stable, so ties keep discovery order).
all_files.sort(key=lambda entry: entry['start'])
print(f"Total CTD files: {len(all_files)}")

# --- Greedy interval cover algorithm ---
# At each step: among all files whose start <= current frontier,
# pick the one with the furthest end time. Advance frontier. Repeat.
# This is the classic greedy minimum interval cover and is optimal.

def minimum_cover(files):
    """
    Split *files* into a minimum covering subset and the redundant rest.

    Each element of *files* is a dict with at least 'start' and 'end' keys
    holding mutually comparable values (datetimes in this notebook).  The
    union of the kept intervals equals the union of all input intervals, so
    real gaps in the data stay gaps and nothing that was covered is lost.

    The input does not need to be pre-sorted; it is sorted by start time
    here (stable, so ties keep the caller's order).

    Returns a ``(keep, deletable)`` tuple of lists.  ``keep`` is in
    chronological order; every input dict appears in exactly one list.
    An empty input yields ``([], [])``.
    """
    if not files:
        return [], []

    ordered = sorted(files, key=lambda f: f['start'])
    total = len(ordered)

    keep = []
    deletable = []
    frontier = None   # right edge of the coverage provided by `keep` so far
    pos = 0           # index of the first file not yet classified

    while pos < total:
        # A file starting beyond the frontier means a real gap in the data:
        # coverage restarts at that file's start time.
        opens_segment = frontier is None or ordered[pos]['start'] > frontier
        if opens_segment:
            frontier = ordered[pos]['start']

        # Batch = every unclassified file that starts at or before the
        # frontier.  Track the one reaching furthest right (first wins ties).
        stop = pos
        best_idx = pos
        while stop < total and ordered[stop]['start'] <= frontier:
            if ordered[stop]['end'] > ordered[best_idx]['end']:
                best_idx = stop
            stop += 1

        best = ordered[best_idx]
        if opens_segment or best['end'] > frontier:
            # `best` either anchors a new segment or extends coverage; the
            # rest of the batch lies inside existing coverage plus `best`.
            keep.append(best)
            deletable.extend(
                ordered[k] for k in range(pos, stop) if k != best_idx
            )
            frontier = max(frontier, best['end'])
        else:
            # Nothing in the batch reaches past the frontier, so all of it is
            # already covered.  Keeping one here would add a redundant file
            # and pull the frontier backwards.
            deletable.extend(ordered[pos:stop])

        pos = stop

    return keep, deletable

import shlex

# Partition the catalogue into files required for coverage and files whose
# time range is already provided by the kept ones.
keep, deletable = minimum_cover(all_files)

print(f"Files needed for full coverage: {len(keep)}")
print(f"Files deletable without losing coverage: {len(deletable)}")

if deletable:
    # Sum of on-disk sizes, reported in MiB.
    total_size = sum(f['path'].stat().st_size for f in deletable) / (1024*1024)
    print(f"Recoverable space: {total_size:.1f} MB")

# --- Write deletion script ---
# Nothing is deleted here: the script is only written out so it can be
# reviewed before being run by hand.
script_path = Path("~/argosy/delete_overlaps2.sh").expanduser()

if deletable:
    lines = [
        "#!/bin/bash",
        "# Auto-generated by ctd_minimum_cover.py",
        "# Deletes CTD files whose coverage is fully provided by other files.",
        "# The remaining files preserve complete time coverage.",
        "",
    ]
    for f in sorted(deletable, key=lambda x: x['start']):
        # shlex.quote makes the path a single literal shell word.  Plain
        # double quotes would still let bash expand $, backticks, \ and "
        # appearing in a filename -- unacceptable in a script that runs rm.
        lines.append(f"rm {shlex.quote(str(f['path']))}")

    # Terminate the last command with a newline so the file is well-formed text.
    script_path.write_text("\n".join(lines) + "\n")
    script_path.chmod(0o755)
    print(f"\nDeletion script written to: {script_path}")
    print(f"Review and run with:  bash {script_path}")
else:
    print("\nNo deletable files found - set is already minimal.")

# --- Plot ---
# One horizontal bar per file, stacked in chronological order: blue bars are
# files kept for coverage, orange bars are files marked deletable.
from matplotlib.patches import Patch

kept_color = 'steelblue'
dropped_color = 'orange'
bar_alpha = 0.7

keep_names = {entry['name'] for entry in keep}
n_keep = len(keep)
n_drop = len(deletable)

# Figure height grows with the number of files, with a floor of 4 inches.
fig_height = max(4, len(all_files) * 0.25)
fig, ax = plt.subplots(figsize=(14, fig_height))

for row, entry in enumerate(all_files):
    # Bar width is the file's time span in days (matplotlib date units).
    span_days = (entry['end'] - entry['start']).total_seconds() / 86400
    bar_color = kept_color if entry['name'] in keep_names else dropped_color
    ax.barh(
        row,
        span_days,
        left=mdates.date2num(entry['start']),
        height=0.7,
        color=bar_color,
        alpha=bar_alpha,
        edgecolor='none',
    )

# X axis: calendar dates with one tick per year, fixed to 2014-2026.
ax.xaxis_date()
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('File index (chronological)', fontsize=12)
ax.set_title(
    f'CTD Minimum Cover ({len(all_files)} files)\n'
    f'Blue=keep ({n_keep})  Orange=deletable ({n_drop})',
    fontsize=13,
)
x_min = mdates.date2num(datetime(2014, 1, 1))
x_max = mdates.date2num(datetime(2026, 12, 31))
ax.set_xlim(x_min, x_max)
ax.grid(True, axis='x', alpha=0.3)
ax.set_yticks([])  # row positions are just indices; hide the tick marks

legend_handles = [
    Patch(facecolor=kept_color, alpha=bar_alpha, label=f'Keep ({n_keep})'),
    Patch(facecolor=dropped_color, alpha=bar_alpha, label=f'Deletable ({n_drop})'),
]
ax.legend(handles=legend_handles, loc='upper left')

plt.tight_layout()
plot_path = Path("~/argosy/ctd_minimum_cover.png").expanduser()
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
plt.show()
print(f"Plot saved to {plot_path}")

CTD Minimum Cover Analysis
============================================================
Total CTD files: 29
Files needed for full coverage: 29
Files deletable without losing coverage: 0

No deletable files found - set is already minimal.

../_images/c52b421fe7b22c1ab4b0edd51b3271e09710aeb3d7399bea2bafe3eae0543d69.png

Plot saved to /home/rob/argosy/ctd_minimum_cover.png