#
# IGRA v2.2 Weather Balloon Analysis
#
# This R routine reads the original IGRA2 station data
# and creates an output file that is easier to process
# It reads in *-data.txt files and produces *_records.csv files
# The *_records files go into Analysis/Records.
#
# Data from: ftp.ncei.noaa.gov/pub/data/igra
#
# Reference for molar density calculation: Connolly, et al., 2021,
# Analyzing Atmospheric Circulation Patterns Using Mass Fluxes Calculated
# from Weather Balloon Measurements: North Atlantic Region as a Case Study
#
# © Copyright (Andy May) 2025, 9/14/2025
#
# Install data.table if not already installed
# install.packages("data.table")

# Root directory, set appropriately for your system
root_dir <- "d:/Climate_Change/weather_balloons/NOAA/Analysis/"
setwd(root_dir)
getwd()
gc(full = TRUE)

output_dir <- paste0(root_dir, "Records/")
   if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)


library(sp)
library(classInt)
library(readr)
library(weathermetrics)
library(data.table)  # Added for faster data handling and writing

file_list <- readLines('file_lists/dir.lst')
#
# Work in directory with the downloaded IGRA2 files
setwd("d:/Climate_Change/weather_balloons/NOAA/data/")

# Begin to loop through IGRA2 files
for (j in 1:length(file_list)) {
  
  # Read the file
  data_rec <- readLines(file_list[j])
  
  # Initialize empty list to collect records for this file
  records <- list()
  
  # Use a while loop for more control over skipping
  i <- 1
  while (i <= length(data_rec)) {
    line <- data_rec[i]
    
    if (substr(line, 1, 1) == '#') {
      # Station record
      id <- as.character(substr(line, 2, 12))
      year <- as.integer(substr(line, 14, 17))
      month <- as.integer(substr(line, 19, 20))
      day <- as.integer(substr(line, 22, 23))
      hour <- as.integer(substr(line, 25, 26))
      if (hour == 99) hour <- NA
      reltime <- as.integer(substr(line, 28, 31))
      if (reltime == 9999) reltime <- NA
      numlev <- as.integer(substr(line, 33, 36))
      p_src <- as.character(substr(line, 38, 45))
      if (p_src == "        ") p_src <- NA  # Adjusted for 8 spaces
      np_src <- as.character(substr(line, 47, 54))
      if (np_src == "        ") np_src <- NA  # Adjusted for 8 spaces
      Ilat <- as.integer(substr(line, 56, 62))
      lat <- as.double(Ilat / 10000)
      Ilong <- as.integer(substr(line, 64, 71))
      long <- as.double(Ilong / 10000)
      
      # Check condition early to skip unnecessary processing
      # Only ascents with >90 pressure levels and since 1990 are accepted
      if (numlev > 90 && year > 1990) {
        process_launch <- TRUE
      } else {
        process_launch <- FALSE
      }
      
      i <- i + 1  # Move to next line
      
      # Now process or skip the next numlev lines
      if (!process_launch) {
        # Skip the data lines
        i <- i + numlev
        next
      }
      
      # Now that we have an acceptable ascent, process the data lines
      for (k in 1:numlev) {
        if (i > length(data_rec)) break  # Safety check
        
        data_line <- data_rec[i]
        
        # Data record
        typ1 <- as.integer(substr(data_line, 1, 1))
        typ2 <- as.integer(substr(data_line, 2, 2))
        etime <- as.integer(substr(data_line, 4, 8))
        if (etime == -9999) etime <- NA
        press <- as.integer(substr(data_line, 10, 15))
        if (press == -9999) press <- NA
        hPa <- if (is.na(press)) NA else press / 100
        p_flag <- as.character(substr(data_line, 16, 16))
        gph <- as.integer(substr(data_line, 17, 21))
        if (gph == -9999 | gph == -8888) gph <- NA
        z_flag <- as.character(substr(data_line, 22, 22))
        Itemp <- as.integer(substr(data_line, 23, 27))
        if (Itemp == -9999 | Itemp == -8888) Itemp <- NA
        temp <- if (is.na(Itemp)) NA else Itemp / 10
        t_flag <- as.character(substr(data_line, 28, 28))
        IRH <- as.integer(substr(data_line, 29, 33))
        if (IRH == -9999 | IRH == -8888) IRH <- NA
        rh <- if (is.na(IRH)) NA else IRH / 10
        IDP <- as.integer(substr(data_line, 35, 39))
        if (IDP == -9999 | IDP == -8888) IDP <- NA
        dpd <- if (is.na(IDP)) NA else IDP / 10
        wdir <- as.integer(substr(data_line, 41, 45))
        if (wdir == -9999 | wdir == -8888) wdir <- NA
        Iwspd <- as.integer(substr(data_line, 47, 51))
        if (Iwspd == -9999 | Iwspd == -8888) Iwspd <- NA
        wspd <- if (is.na(Iwspd)) NA else Iwspd / 10
        
        # Molar Density
        mol_den <- if (is.na(hPa) | is.na(temp)) NA else (hPa * 100) / ((temp + 273.15) * 8.3145)
        
        # rh from dpd (dew point depression) if rh is NA (null)
        # The dewpoint is equal to the air temperature - dewpoint depression
        if (is.na(rh) && !is.na(dpd) && !is.na(temp)) {
          rh <- dewpoint.to.humidity(t = temp, dp = (temp - dpd), 
                  temperature.metric = 'celsius')
        }
        
        # Compute q_gkg (specific humidity)
         t_k = temp + 273.15  # Convert to K
         es = 6.1078 * exp((17.269388 * (t_k - 273.16)) / (t_k - 35.86)) #Magnus-Tetens eq. per Murray, 1967 
         e = (rh / 100) * es
         q_gkg = ifelse(hPa > e & !is.na(temp) & 
                  !is.na(rh) & !is.na(hPa), 
                   0.622 * e / (hPa - e) * 1000, NA_real_)

        
        # Collect as named list
        records[[length(records) + 1]] <- list(
          id = id, year = year, month = month, day = day, hour = hour, reltime = reltime,
          numlev = numlev, p_src = p_src, np_src = np_src, lat = lat, long = long,
          typ1 = typ1, typ2 = typ2, etime = etime, hPa = hPa, p_flag = p_flag,
          gph = gph, z_flag = z_flag, temp = temp, t_flag = t_flag, mol_den = mol_den,
          rh = rh, dpd = dpd, wdir = wdir, wspd = wspd, q_gkg = q_gkg
        )
        
        i <- i + 1
      }
    } else {
      # If not a header (should not happen if data is well-formed), skip
      i <- i + 1
    }
  }  # End while loop
  
  # After processing the file, write if there are records
  if (length(records) > 0) {
    dt <- rbindlist(records)
    fwrite(dt, file = paste0(output_dir, id, "_records.csv"), sep = ",", 
           quote = FALSE, row.names = FALSE)
  }
  
  # Clean up memory
  rm(records, data_rec, dt)
  gc(full = TRUE)
}  # End j (file reading) loop