#
# IGRA v2.2 Weather Balloon Analysis
#
# Data: ftp.ncei.noaa.gov/pub/data/irga
# 
#
#
# This program reads the monthly datasets built by MoLat.r in Month_records
# and makes binned plots of average values and stats 
# by latitude slice and month
#
# IGRA v2.2 Weather Balloon Analysis - Improved Version
# © Copyright (Andy May) - Updated & Optimized December 2025

library(data.table)
library(dplyr)
library(zoo)
library(ggplot2)
library(purrr)    # for map(), safely(), etc.
library(readr)    # faster CSV reading if needed

root_dir <- "D:/Climate_Change/weather_balloons/NOAA/Analysis/"
month_dir <- file.path(root_dir, "Month_records")
output_dir <- file.path(month_dir, "bin_plots")

# Create output directory once
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

# Load plot configuration
plot_scales <- fread(file.path(root_dir, 'variable_lists_details/plot_scales_title.csv'))

# Define which variables to plot 
plot_indices <- c(2, 4, 5, 6, 8, 9, 10, 11)

# Pre-compute month abbreviations for titles
month_abbs <- month.abb

imonth<-1
# Main processing loop over months
for (imonth in 1:12) {
  
  fname <- file.path(month_dir, paste0("Month_", imonth, ".rds"))
  message("Processing ", month_abbs[imonth], " (", fname, ")")
  
  Month <- readRDS(fname)
  
  # Create 10-degree latitude slices (-85 to 85 typically)
  Month[, lat_slice := floor(lat / 10) * 10]  # cleaner than as.integer(lat/10)
  
  # Get unique latitude slices
  unique_slices <- sort(unique(Month$lat_slice))

  lat_val <- unique_slices[1]  
  # Process each latitude slice
  for (lat_val in unique_slices) {
    
    cur_lat <- Month[lat_slice == lat_val]
    
    # Sort by decreasing pressure (standard atmospheric profile)
    setorder(cur_lat, -hPa)
    
    # Fill small gaps (max 9 levels) - vectorized and safer
    vars_to_interp <- c("hPa", "temp", "mol_den", "gph", "q_gkg", "rh", "wspd", "wdir")
    for (v in vars_to_interp) {
      if (any(!is.na(cur_lat[[v]]))) {
        cur_lat[[v]] <- zoo::na.approx(cur_lat[[v]], maxgap = 9, na.rm = FALSE)
      }
    }
    
    # Skip if essential data missing
    if (anyNA(cur_lat$mol_den) || anyNA(cur_lat$hPa)) next
    
    # === Fit upper and lower molar density vs pressure lines ===
    lower <- cur_lat[hPa > 300 & hPa < 750]
    upper <- cur_lat[hPa < 125]
    
    lower_slope <- lower_int <- upper_slope <- upper_int <- NA_real_
    
    if (nrow(lower) > 2) {
      lm_lower <- lm(hPa ~ mol_den, data = lower)
      lower_slope <- coef(lm_lower)[2]
      lower_int   <- coef(lm_lower)[1]
    }
    
    if (nrow(upper) > 2) {
      lm_upper <- lm(hPa ~ mol_den, data = upper)
      upper_slope <- coef(lm_upper)[2]
      upper_int   <- coef(lm_upper)[1]
    }
    
    # === Find intersection of upper and lower fits ===
    int_hPa <- int_mol_den <- NA_real_
    
    if (all(!is.na(c(upper_slope, upper_int, lower_slope, lower_int)))) {
      # Solve: hPa = m1 * mol_den + b1
      #        hPa = m2 * mol_den + b2
      A <- matrix(c(1, -upper_slope, 1, -lower_slope), nrow = 2, byrow = TRUE)
      B <- c(upper_int, lower_int)
      sol <- solve(A, B)
      int_hPa     <- sol[1]
      int_mol_den <- sol[2]
    }
    
    # === Interpolate key variables at intersection pressure ===
    interp_at_int <- function(var) {
      if (!is.null(cur_lat[[var]]) && sum(!is.na(cur_lat[[var]])) > 2 && 
          !is.na(int_hPa) && int_hPa > 0) {
        approx(cur_lat$hPa, cur_lat[[var]], xout = int_hPa, rule = 1)$y
      } else NA_real_
    }
    
    int_temp  <- interp_at_int("temp")
    int_gph   <- interp_at_int("gph")
    int_wspd  <- interp_at_int("wspd")
    int_wdir  <- interp_at_int("wdir")
    int_rh    <- interp_at_int("rh")
    int_q_gkg <- interp_at_int("q_gkg")
    
    # Derived quantities at intersection
    int_MW      <- if (!is.na(int_rh)) 28.97 * (1 - int_rh/100) + 18.02 * (int_rh/100) else NA
    int_spDen   <- if (!is.na(int_mol_den) && !is.na(int_MW)) int_mol_den * int_MW else NA
    int_M_flux  <- if (!is.na(int_spDen) && !is.na(int_wspd)) int_spDen * int_wspd else NA
    int_NS_flux <- if (!is.na(int_wspd) && !is.na(int_wdir) && !is.na(int_spDen)) {
      -int_wspd * sin(int_wdir * pi / 180) * int_spDen
    } else NA
    
    # === Add derived columns to full profile ===
      # Geopotential height in km
        cur_lat[, gph_km := gph / 1000]

      # Specific density (g/m�)
        cur_lat[, rho_i := mol_den * (28.97 * (1 - rh/100) + 18.02 * (rh/100))]

      # Northward velocity (+ is north)
        cur_lat[, v_i := -wspd * cos(wdir * pi / 180)]

      # Eastward velocity (positive east)
        cur_lat[, u_i := -wspd * sin(wdir * pi / 180)]

      # Meridional (N-S) and total mass flux
        cur_lat[, NS_flux := rho_i * v_i]
        cur_lat[, M_flux := rho_i * wspd]    
    
      # Lapse rate (K/km, positive = stable)
      # Define the grouping key for one full profile (adjust if needed)
        profile_cols <- c("id", "year", "month", "day", "hour")  # or include 
                                                                 # reltime if needed

      # First, make sure data is sorted by decreasing pressure or increasing 
      # gph within each profile
      # Usually we sort by decreasing hPa (surface to top) or increasing gph
        cur_lat <- cur_lat[order(id, year, month, day, hour, -hPa), ]  # or use 
                                                                       # gph ascending

      # Now compute lapse rate BY GROUP
        cur_lat[, `:=`(
          delta_temp = data.table::shift(temp, n=1, type="lead") - 
                       data.table::shift(temp, n=1, type="lag"),
          delta_gph_km = (data.table::shift(gph, n=1, type="lead") - 
                          data.table::shift(gph, n=1, type="lag")) / 1000
        ), by = profile_cols]

        cur_lat[, lapse_rate := -delta_temp / delta_gph_km]

      # Clean up temporary columns
        cur_lat[, c("delta_temp", "delta_gph_km") := NULL]
    
    # === Plotting loop ===
    for (iplot in plot_indices) {
      
      x_var <- plot_scales$Name[iplot]
      if (!x_var %in% names(cur_lat)) next
      
      # Prepare data for binning
      plot_dat <- cur_lat[!is.na(get(x_var)) & !is.na(hPa), .(hPa, x_val = get(x_var))]
      
      # Pressure bins: 10 hPa intervals
      breaks <- seq(10, 1030, by = 10)
      plot_dat[, press_bin := cut(hPa, breaks = breaks, include.lowest = TRUE, labels = FALSE)]
      
      binned <- plot_dat[!is.na(press_bin),
        .(avg_hPa = mean(hPa),
          avg_x   = mean(x_val),
          n_obs   = .N),
        by = press_bin
      ][n_obs > 0]
      
      # File and title
      plotname <- file.path(output_dir,
                            sprintf("%s_Lat_slice_%d_%s.png",
                                    month_abbs[imonth], lat_val, x_var))
      
      title <- sprintf("%s, Latitude %d\u00b0 to %d\u00b0, %s",
                 month_abbs[imonth], lat_val, lat_val + 10, plot_scales$Title[iplot])
      
      p <- ggplot(binned, aes(x = avg_x, y = avg_hPa)) +
        geom_point(size = 1.5) +
        scale_y_reverse(limits = c(plot_scales$base[1], plot_scales$top[1])) +
        scale_x_continuous(limits = c(plot_scales$top[iplot], plot_scales$base[iplot])) +
        labs(title = title,
             x = paste0(plot_scales$Title[iplot], " (", plot_scales$Units[iplot], ")"),
             y = "Pressure (hPa)") +
        theme_bw(base_size = 12) +
        theme(plot.title = element_text(size = 16, face = "bold"),
              axis.title = element_text(size = 14))
      
      # Add intersection line
      if (!is.na(int_hPa)) {
        p <- p + geom_hline(yintercept = int_hPa, color = "blue", linetype = "dashed")
      }
      
      # Special overlays
      if (iplot == 2) {  # molar density plot
        if (!is.na(upper_slope)) {
          p <- p + geom_abline(slope = -upper_slope, intercept = -upper_int, color = "blue")
        }
        if (!is.na(lower_slope)) {
          p <- p + geom_abline(slope = -lower_slope, intercept = -lower_int, color = "green")
        }
      }
      
      if (iplot == 5) {  # lapse rate - WMO tropopause cutoff
        p <- p + geom_vline(xintercept = 2, color = "blue", linetype = "dashed")
      }
      
      if (iplot %in% c(8, 10, 11)) {  # u, v, NS_flux
        p <- p + geom_vline(xintercept = 0, color = "blue", linetype = "dashed")
      }
      # print(p)
      
      ggsave(plotname, plot = p, width = 8, height = 6, dpi = 300, bg = "white")
    } # end plot loop
  } # end for lat_val in unique_slices
  
  # Cleanup per month
  rm(Month, cur_lat)
  gc()
}

message("All months and latitude slices processed successfully!")