#! /usr/bin/python3 
# Last edited on 2025-08-24 05:05:22 by stolfi

# Reads a set of {nb} narrow-band monochrome images for a given page and
# clip. Extracts from those images a set of pixels, selected by a binary
# mask image.  For those pixels, computes
#
#   {savg} the mean spectrum
#
#   {smin} the "darkest" spectrum
#
#   {smax} the "lightest" spectrum
#
#   {P}    a matrix with the {ne} principal components of those spectra,
#
#   {rdev} the total variance of those spectra that is not 
#          accounted for by those principal components.
# 
# The "darkest" spectrum {smin} is such that, for any spectral band {ib}
# in {0..nb-1}, only 0.025 of the sampled pixels have lower albedo in
# that band than {smin[ib]}. The "lightest" spectrim is similar but with
# "higer alebdo" insread of "lower albedo".
#
# The principal components are the {ne} eigenvectors of the covariance
# matrix with largest eigenvalues, scaled by the square root of those
# eignevalues. The number {ne} is currently fixed at 10.
#
# The parameters are the {page}, the clip's name {clip}, and the mask name
# {mask}.
#
# The program repeats the analysis for a list of {nm} mask names. It
# saves the results of all these analyses to text files. 
#
# Each eigenvector selected as principal component has its diection
# flipped as needed so that the sum of the elements is non-negative.
# This means that increasng the albedo of a pixel on all bands by the
# same amount will cause the projection on that eigenvector to remain
# the same or increase, never decrease.
#
# INPUT FILES
# 
# Assumes that there is a file "MS/davis/{page}/bands.txt" that specifies the
# names of the bands avaliable for that page (e.g. "MB870IR_030_F")
# as well as their illumnation type {illum} ({0..3}) and main wavelength {wlen}.
#
# The input images are read from "{cdir}/npy/{band}.npy", where {cdir}
# is "pages/{page}/clips/{clip}", for each {band} listed in the
# "bands.txt" file above that was imaged in reflected light, with
# frontal narrow-band illumination ({illum == 0} and name starts with
# "MB"). 
#
# These clip image files are assumed to be {numpy} 2D matrices of floats
# stored with {numpy.save}. The values are assumed to have been
# extracted from the TIFF files without any gamma mapping (other than
# whatever was used to encode those file by the scanning team) and
# scaled from some integer range {0..maxval}, specific to each band, to
# the float range {[0 _ 1]}.
#
# Each mask image is read from the local file "{cdir}/masks/{mask}.png",
# where {mask} is the mask's name(e.g. "ch", "di", "bg"). This file is
# supposed to be in PNG (not PGM) format. The mask image should have the
# same dimensions as the clip images, with pixels either 0 ("ignore") or
# 1 ("take").
#
# MAIN OUTPUT FILES
#
# The principal components and other info for each {mask} are saved to disk as as file
# "{mdir}/analysis.txt" where {mdir} is "pages/{page}/clips/{clip}/masks/{mask}".
#
# The first two lines of each of these files are "nb = {nb}" and "ne = {ne}".
#
# After this header come {nb} lines, each with {1 + (ne+2)*nm} columns. Each
# line except the last one has the format 
#  
#   "{band[ib]} {illum[ib]} {wlen[ib]} {data[0]} ...{data[nm-1]}"
#
# where {band[ib]} is the name of a spectrum band, {illum[ib]} and {wlen[ib]}
# are the illumination type and wavelength of that band (as in the "bands.txt" file)
# and {data[im]} is data for mask {im}. 
#
# The data for each mask is {ne+5} columns
#   
#   "{savg[ib,im]} {smin[ib,im]} {smax[ib,im]} {rdev[ib,im]} {tdev[ib,im]}" + 
#   " {P[0,ib,im]} ... {P[ne-1,ib,im]}"
#
# where {tdev[ib,im]} is the magnitude of the residual not accounted for
# by the first component {P[ib,0,im]}, that is, {tdev[ib,im] =
# sqrt(SUM(P[ib,ie,im]^2 : ie\in 1..ne-1) + rdev[ib,im]^2)}
#
import sys, re, os
from math import sqrt, sin, cos, log, exp, floor, pi, inf
import numpy
from PIL import Image
from error_funcs import prog_error, arg_error, debug
from process_funcs import bash, run_command
from bands_table_funcs import read_page_bands_tables
from image_funcs import read_mask_image, read_clip_npy_images
import argparser

def main():

  page, clip, masks = parse_options()
  nm = len(masks)

  # masks the spectral bands tables:
  bfile = f"MS/davis/{page}/bands.txt"
  bands_tb = read_page_bands_tables(bfile)
  
  # Select the bands to use:
  bands = []      # Bands included in the analysis.
  for band in bands_tb.keys():
    illum = bands_tb[band]['illum']
    if illum == 0:
      assert band[0:2] == "MB", "bug: band image name"
      bands.append(band)
  nb = len(bands)
  debug("bands used", bands)
  assert nb > 0, "bug nb"
  
  # Get the clip images:
  C = read_clip_npy_images(page, clip, bands)
  assert numpy.size(C,2) == nb
  ny, nx = numpy.shape(C[:,:,0])
  
  # Compute the principal components and extremal spectra:
  ne = 10; # Number of principal components to keep.
  savg = numpy.zeros((nb,nm,))
  smin = numpy.zeros((nb,nm,))
  smax = numpy.zeros((nb,nm,))
  rdev = numpy.zeros((nb,nm,))
  P = numpy.zeros((nb,ne,nm,))
  
  for im in range(nm):
    mask = masks[im]
    sys.stderr.write(f"=== mask = {mask} ===============================================\n")
    
    # Get the mask image:
    mfile = f"pages/{page}/clips/{clip}/masks/{mask}/mask.png"
    M = read_mask_image(mfile)
    
    # Obtain the relevant pixel spectra:
    S = extract_masked_pixel_spectra(C, M)
    ns = numpy.size(S,0)
    fs = ns/(ny*nx)
    sys.stderr.write(f"mask {mask}: selected {ns} samples of {nx*ny} ({fs:.4f})\n")
    debug("S[0,:]", S[0,:])
    debug("S[:,0]", S[:,0])
    assert numpy.shape(S) == (ns,nb,)

    # Compute principal components:
    savg, rdev, P = principal_components(S, ne)
    ne = numpy.size(P,1)
    assert numpy.shape(savg) == (nb,), "bug savg shape"
    assert numpy.shape(rdev) == (nb,), "bug rdev shape"
    assert numpy.shape(P) == (nb,ne,), "bug P shape"
      
    # Compute extremal spectra:
    u = P[:,0]/numpy.linalg.norm(P[:,0])
    smin, smax = extremal_spectra(S, savg, u)
  
    # Save analysis results:
    write_analysis_data \
      ( page, clip, mask, bands, bands_tb, 
        savg, smin, smax, rdev, P
      )

  return 0
  # ----------------------------------------------------------------------
  
def extract_masked_pixel_spectra(C, M):
  # Reads the image of the given {page} and {clip} illuminated by the
  # given {band}. Extracts the elements where the mask image {M} is
  # nonzero, and returns them as a list, in row-by-row order.
  #
  # Both {M} and the clip image must have shape {(ny,nx)}.
  
  ny, nx, nb = numpy.shape(C)
  if numpy.shape(M) != (ny, nx):
    arg_error(f"mask and clip have different sizes {(nx,ny)} {numpy.shape(M)}")

  # Colect the masked samples:
  ns = None; # For now.
  S = None # For now.
  for ib in range(nb):
    samples = \
      [ min(1.0, C[iy,ix,ib]) \
        for iy in range(ny) \
          for ix in range(nx) \
            if M[iy,ix] != 0 \
      ]
    if ns == None:
      ns = len(samples)
      S = numpy.zeros((ns,nb,))
    else:
      assert len(samples) == ns, "bug inconsitent sample counts"
    S[:,ib] = numpy.array(samples)
  return S
  # ----------------------------------------------------------------------
  
def principal_components(S, ne):
  # Expects {S} to be a matrix of pixel spectra. It must have shape
  # {ns,nb}, where {ns} is the number of relevant pixels and {nb} is the
  # number of spectral bands.
  #
  # Computes the average {savg} of all the spectra, and their covariance
  # matrix {V}, with shape {(nb,nb)}. Then computes the eigenvectors of
  # {V} as the columns of a matrix {E} with shape {(nb,nb)}, and the
  # vector of the respective eigenvalues {e}. Then extracts the {ne}
  # eigenvectors with largest eigenvalues, each scaled by the square
  # root of its eigenvalue, as the rows of a matrix {P} with shape
  # {(nb,ne)}. Also, for each band {ib} computes {rdev[ib]} which is the
  # Euclidean norm of {( sqrt(e[ke])*E[ib,ke] : ke \in ne..nb-1 )}.
  # 
  # Returns {savg,rdev,P}.
  
  ns, nb = numpy.shape(S)
  
  # Compute the barycenter (mean) {savg} of the spectra:
  savg = barycenter(S)
  debug("max(abs(savg))", numpy.max(numpy.abs(savg)))
  assert numpy.shape(savg) == (nb,), "bug: savg shape"
  
  # Compute {D} whose rows are the displacements from barycenter:
  D = numpy.array([ S[ks,:] - savg for ks in range(ns) ])
  debug("max(abs(D))", numpy.max(numpy.abs(D)))
  
  # Compute the covariance matrix {V=D'*D}:
  V = (D.transpose() @ D) / float(ns)
  debug("max(V)", numpy.max(V))

  # Compute the eigenvalues {E} and eigenvectors {e}:
  eig = numpy.linalg.eigh(V)

  e = eig.eigenvalues
  sys.stderr.write(f"eigenvalues:\n{e}\n");
  assert numpy.shape(e) == (nb,), "bug: eigenvalues shape"

  E = eig.eigenvectors
  sys.stderr.write(f"eigenvectors:\n{E}\n")
  assert numpy.shape(E) == (nb,nb,), "bug: eigenvectors shape"
  
  # Extract the {ne} eigenvectors with largest eigenvalues:
  P = [ sqrt(e[nb-1-ie])*E[:,nb-1-ie] for ie in range(ne) ]
  P = numpy.array(P).transpose()
  
  # Adjust the sign of the selected eigenvectors:
  for ie in range(ne):
    sum = numpy.sum(P[:,ie])
    if sum < 0: P[:,ie] = -P[:,ie]

  # Compute the residual deviation {rdev}:
  rdev = numpy.zeros((nb,))
  for ib in range(nb):
    sum = 0
    for ke in range(nb-ne):
      sum += e[ke]*E[ib,ke]**2
    rdev[ib] = sqrt(sum)
    
  assert numpy.shape(savg) == (nb,)
  assert numpy.shape(P) == (nb,ne,)
  assert numpy.shape(rdev) == (nb,)

  check_principal_components(S, savg, P)

  return savg, rdev, P
  # ----------------------------------------------------------------------
  
def extremal_spectra(S, savg, u):
  # Expects {S} to be a matrix of pixel spectra. It must have shape
  # {ns,nb}, where {ns} is the number of relevant pixels and {nb} is the
  # number of spectral bands. Expects savg to be the nominal barycenter
  # of all those spectra. Expecs {u} to be the principal eigenvector of
  # the cloud, with unit norm and shape {(nb,)}, oriented from "dark" to
  # "bright".
  #
  # Computes the "darkest" spectrum {smin} and the "lightest" spectrum
  # {smax} based on the projection {t[ks]} of the pixel spectra {S[ks,:]
  # - savg} along the direction of {P0}.
  #
  # Specificaly, let {tmin} and {tmax} be the 
  # the {fmin} and {1-fmin} fractiles of the projections {t[:]}.
  # The "darkest" spectrum {smin[:]} is defined as {savg + tmin*P0},
  # and the "ligtest" spectrum is defined as {savg + tmax*P0}.
  # 
  # Returns {smin,smax}.
  
  ns, nb = numpy.shape(S)
  assert numpy.shape(savg) == (nb,)
  assert numpy.shape(u) == (nb,)
  
  # Collect the total projections of all the pixels:
  t = []
  for ks in range(ns):
    t.append(numpy.dot(S[ks,:] - savg, u))
  t = numpy.array(t)
  assert numpy.shape(t) == (ns,), "bug t shape"
    
  # Get the fractiles:
  fmin = 0.02
  tmin = numpy.quantile(t, fmin)
  tmax = numpy.quantile(t, 1 - fmin)
  debug("t quantiles", (tmin, tmax,))
  assert tmin < 0 and tmax > 0, "bug bad tmin,tmax signs"
  
  # Compute the extremal spectra:
  smin = savg + tmin*u
  smax = savg + tmax*u
  assert numpy.shape(smin) == (nb,)
  assert numpy.shape(smax) == (nb,)
  return smin, smax
  # ----------------------------------------------------------------------

def barycenter(P):
  # Expects {P} to be a {numpy} matrix. Computes the 
  # average of all rows. 
  savg = numpy.sum(P, 0)
  debug("shape(savg)", numpy.shape(savg))
  savg = savg/numpy.size(P, 0)
  return savg
  # ----------------------------------------------------------------------
  
def check_principal_components(S, savg, P):
  # Checks whether the projections of the pixel spectra in {S}
  # along the direction of {P} has indeed deviation {|P|}.
  
  ns, nb = numpy.shape(S)
  ne = numpy.size(P,1)
  assert numpy.shape(P) == (nb, ne,), "bad {P} shape"
  assert numpy.shape(savg) == (nb,), "bad {savg} shape"
  for ie in range(ne):
    Pi = P[:,ie]
    Pmod = numpy.linalg.norm(Pi)
    
    # Check mean in direction {Pi}:
    savg_proj = (savg @ Pi)/Pmod
    S_proj = (S @ Pi)/Pmod
    S_proj_avg = numpy.mean(S_proj)
    err_avg = S_proj_avg - savg_proj
    if err_avg > 1.0e-10*sqrt(ns):
      sys.stderr.write(f"savg_proj = {savg_proj}  S_proj_avg = {S_proj_avg} err = {err_avg}\n")
      assert False, "bug principal component: mean"
    
    # Check deviation in direction {pi}:
    S_proj_dev = numpy.std(S_proj)
    err_dev = Pmod - S_proj_dev 
    if err_dev > 1.0e-10*sqrt(ns):
      sys.stderr.write(f"Pmod = {Pmod}  S_proj_dev = {S_proj_dev} err = {err_dev}\n")
      assert False, "bug principal component: deviation"
  return 
  # ----------------------------------------------------------------------

def write_analysis_data(page, clip, mask, bands, bands_tb, savg, smin, smax, rdev, P):
  # Writes a file with the analysis
  # results for the subset of samples of clip {clip} in page {page}
  # selected by the mask {mask}, namely {savg,smin,smax,rdev,P}.
  # Assumes that the analysis used the spectral bands
  # with names {bands[0..nb-1]}. See the top of this
  # module for the format of this file.
  #
  nb, ne = numpy.shape(P)
  assert len(bands) == nb, "inconsitent {bands}"
  assert numpy.shape(savg) == (nb,)
  assert numpy.shape(smin) == (nb,)
  assert numpy.shape(smax) == (nb,)
  assert numpy.shape(rdev) == (nb,)

  adir = f"pages/{page}/clips/{clip}/masks/{mask}"
  bash(f"mkdir -p {adir}")
  afile = f"{adir}/analysis.txt"
  sys.stderr.write(f"writing {afile} ...\n")

  sfmt = " %+8.4f" # Format for signed PC components
  pfmt = " %7.4f" # Format for non-negative PC/spectrum components

  with open(afile, "w") as wr:
    wr.write(f"nb = {nb}\n")
    wr.write(f"ne = {ne}\n")
    for ib in range(nb):
      band = bands[ib]
      wr.write("%-15s" % band)
      wr.write(f" {bands_tb[band]['illum']:d}")
      wr.write(" %7.1f" % bands_tb[band]['wlen'])

      wr.write("  ")

      for svar in ( savg, smin, smax, rdev ):
        wr.write(pfmt % svar[ib])

      # Total PC magnitude except the first one:
      sum2_P = 0;
      for ke in range(1,ne): sum2_P += P[ib,ke]**2
      tdev = sqrt(sum2_P + rdev[ib]**2)
      wr.write(pfmt % tdev)

      for ie in range(ne):
        wr.write(sfmt % P[ib,ie])

      wr.write("\n")
    wr.close()
  return
  # ----------------------------------------------------------------------

def parse_options():
  na = len(sys.argv)
  assert na >= 4, "insuff args"
  page = sys.argv[1]
  clip = sys.argv[2]    # name of clip to take pixels from.       
  masks = [ sys.argv[k] for k in range(4,na) ]
  return page, clip, masks
  # ----------------------------------------------------------------------

def band_error(band, msg):
  sys.stderr.write(f"** {band}: {msg}\n")
  assert False
  # ----------------------------------------------------------------------

main()