#! /bin/bash
# Last edited on 2024-10-06 18:49:27 by stolfi

# Program name (basename of the invocation path) and one-line description.
PROG_NAME=${0##*/}
PROG_DESC="find all source-like files under given directories"

# Synopsis, printed after "usage:" when the command line is rejected.
PROG_HELP=(
  "${PROG_NAME} [ -notheses ] [ -exclude {PATH_PAT} ].. {DIR}.."
)

# Full manual text.  The "\n" sequences are stored literally; presumably they
# are meant to be expanded with "echo -e", as is done for PROG_HELP (nothing
# in this file prints PROG_INFO itself).
PROG_INFO=(
  "\nNAME"
  "\n  ${PROG_NAME} - ${PROG_DESC}."
  "\n"
  "\nSYNOPSIS"
  "\n  ${PROG_HELP[@]}"
  "\n"
  "\nDESCRIPTION"
  "\n  Writes to \".all-sources\" a list of all source-like files"
  "\n  in the specified directories."
  "\n"
  "\n  Includes files with standard source-file extensions, such as"
  "\n  \".c\", \".h\" (C language), \".cpp\" (C++), \".awk\"  (AWK), \".gawk\"  (GAWK),"
  "\n  \".pas\" (Pascal), \".f\" (FORTRAN), \".lsp\" (Lisp),"
  "\n  \".el\" (Emacs Lisp), \".py\" (Python), \".sh\" (bash), \".csh\" (C shell),"
  "\n  \".i3\", \".m3\", \".ig\", and \".mg\" (Modula-3)."
  "\n"
  "\n  Also includes files called \"Makefile\" or with \".make\" extension,"
  "\n  except \"Deps.make\"."
  "\n"
  "\n  Excludes from the list garbage files.  These include files that have names"
  "\n  containing an iso-style date like \"2019-11-19\""
  "\n  (delimited by dashes or slashes) or"
  "\n  any of the strings \"junk\", \"old\", \"snap\", \"snapshot\", or \"save\""
  "\n  (delimited by periods, dashes, or slashes), in any capitalization; and"
  "\n  directory trees whose root is called \"00-ARCHIVE\","
  "\n  \"00-MIRROR\", \"00-BACKUP\", \"00-RESTORE\","
  "\n  \"GARBAGE\", \"IMPORT-*\", \"PUB/include\", \"pkg\","
  "\n  \"posters/*/[0-9]*[0-9][0-9][-0-9]*[0-9]/\","
  "\n  or \"hand-in\", in that capitalization."
  "\n"
  "\n  Also writes to \".chfiles\" the subset"
  "\n  of \".all-sources\" that are \"Makefile\" or"
  "\n  have extension \".c\" or \".h\"."
  "\n"
  "\nOPTIONS"
  "\n  -notheses"
  "\n    Excludes subdirectories called \"theses\" or \"teses\", too, in any"
  "\n    capitalization."
  "\n"
  "\n  -exclude {PATH_PAT}"
  "\n    Excludes subdirectories whose full pathname matches {PATH_PAT},"
  "\n    a find(1) \"-path\" pattern.  Note that in such patterns \"*\""
  "\n    matches any string, including slashes, so \"**\" is equivalent"
  "\n    to \"*\".  This option may be repeated."
  "\n"
  "\nSEE ALSO"
  "\n  find(1)"
  "\n"
  "\nAUTHOR"
  "\n  Created 2006-05-08 by Jorge Stolfi, Unicamp"
)

# ----------------------------------------------------------------------
# INTERNAL OPTIONS

# ----------------------------------------------------------------------
# COMMAND LINE PARSING

# Parse command line switches.
#
# Results (global variables):
#   notheses      command (as an array) used to filter lists of path names,
#                 one per line; drops "theses"/"teses" directories when
#                 "-notheses" was given, and is a plain "cat" otherwise.
#   excludes      extra "-path {PATH_PAT} -prune -o" clauses for find(1),
#                 one group per "-exclude" switch.
#   num_opt_args  number of leading arguments that were consumed as switches.
notheses=( cat )
excludes=( )
num_opt_args=0

# Prints the message "$1" and the usage line to stderr, then exits with
# status 1.
usage_error() {
  echo "$1" 1>&2
  echo -e "usage:\n  ${PROG_HELP[*]}" 1>&2
  exit 1
}

# Consumes the leading switches of "$@", stopping at the first argument that
# does not begin with "-".  Updates the global variables described above.
parse_options() {
  num_opt_args=0
  while [[ $# -ge 1 ]]; do
    case "$1" in
      -exclude)
        if [[ $# -lt 2 ]]; then
          usage_error "option -exclude requires a {PATH_PAT} argument"
        fi
        # Store the pattern verbatim.  Wrapping it in literal quote
        # characters would make them part of the pattern seen by find(1),
        # which then would never match any real path.
        excludes+=( -path "$2" -prune -o )
        echo "excluding '$2'" 1>&2
        shift 2
        num_opt_args=$(( num_opt_args + 2 ))
        ;;
      -notheses)
        # Match "theses" or "teses" only as a complete path component;
        # "-i" makes the match independent of capitalization.
        notheses=( grep -E -v -i -e '(^|/)(theses|teses)(/|$)' )
        shift
        num_opt_args=$(( num_opt_args + 1 ))
        ;;
      -*)
        usage_error "unknown option $1"
        ;;
      *)
        break
        ;;
    esac
  done
}

parse_options "$@"
shift "${num_opt_args}"

# echo "notheses = ( ${notheses[*]} )" 1>&2

# Get positional parameters: the directories to be searched.

# Sets the global array "dirs" to those of the arguments that survive the
# "notheses" filter command.  Each argument is handled as a single word, so
# directory names containing blanks or glob characters are kept intact.
# Empty arguments are dropped.
filter_dirs() {
  dirs=( )
  local dir kept
  for dir in "$@"; do
    kept="$( printf '%s\n' "${dir}" | "${notheses[@]}" )"
    if [[ -n "${kept}" ]]; then
      dirs+=( "${dir}" )
    fi
  done
}

filter_dirs "$@"
echo "dirs = [${dirs[*]}]" 1>&2

# Fall back to the default directories when none was given (or when all of
# them were rejected by the filter).
if [[ ${#dirs[@]} -eq 0 ]]; then
  dirs=( ./ /home/stolfi2/stolfi/ )
fi

echo "searching directories ${dirs[*]}" 1>&2

# # Check for leftover arguments:
# if [[ $# -ne 0 ]]; then
#   echo 'wrong number of arguments "'"$1"'" ...' 1>&2
#   echo -e "usage:\n  ${PROG_HELP[@]}" 1>&2 ; exit 1 
# fi

# END COMMAND LINE PARSING
# ----------------------------------------------------------------------

# Prefix for temporary file names.
# The temporary files live in a private directory created by mktemp(1),
# instead of under a predictable name directly in the world-writable "/tmp".
# The directory (with anything left in it) is removed when the script exits.
tmp_dir="$( mktemp -d "/tmp/$$-XXXXXX" )" || {
  echo "${0##*/}: cannot create a temporary directory" 1>&2
  exit 1
}
trap 'rm -rf -- "${tmp_dir}"' EXIT
tmp="${tmp_dir}/$$"

# Writes to stdout the names of all plain files under the directories in the
# global array "dirs" whose names look like source files, skipping the
# directory trees that are known to hold archives, backups, bulk data, etc.
#
# Globals read:
#   dirs      directories to search.
#   excludes  additional find(1) clauses ("-path PAT -prune -o" groups); they
#             are inserted verbatim, one find argument per array element, so
#             that patterns are neither word-split nor glob-expanded by the
#             shell.
find_named_sources() {
  # Files and directories pruned by their own name:
  local -a pruned_names=(
    '*-dotfiles*'
    '.[a-zA-Z0-9]*'
    '00-ARCHIVE'
    '00-BACKUP'
    '00-MIRROR'
    '00-RESTORE'
    '00-TARFILES'
    '00-TARFILES-CONVERTED'
    '00-TO-CD'
    'ARCHIVE'
    'ChromiumUnsnapped'
    'GARBAGE'
    'IMPORT-*'
    'JSLIBS-backup'
    'JSLIBS-LATER'
    'JUNK'
    'MODPKG'
    'OLD'
    'SAVE'
    'hand-in'
    'include'
    'out'
    'packages'
    'pkg'
    'snap'
  )

  # Files and directories pruned by their whole path:
  local -a pruned_paths=(
    '**/Download/books/everett-piraha'
    '**/Download/samsung-galaxy4/Pictures'
    '**/Download/videos'
    '**/Pictures/Webcam'
    '**/WhatsApp/Media'
    '**/bureau/diretoria'
    '**/mc857/2020-1/notas/ra'
    '**/posters/*/[0-9][0-9][-0-9]*[0-9]*'
    '**/programs/python/mc857-*'
    '**/projects/biobank/**/item'
    '**/projects/bitcoin/bitcointalk/Gavin_Andresen'
    '**/projects/bitcoin/wallets/raw/BitPay.com/[0-9][0-9]*'
    '**/projects/eleicoes/2010/dados-por-urna*'
    '**/projects/fragments/IAB-2002/2009-01-30-manaus/RAW'
    '**/projects/fragments/ceramic-3/data'
    '**/projects/fragments/glazed-1/data'
    '**/projects/image-collections/life'
    '**/projects/imgbank/**/[0-9][0-9][0-9]'
    '**/projects/imgbank3/**/[0-9][0-9][0-9]'
    '**/projects/musis/2009-12-11-ceaz-thesis'
    '**/projects/musis/[A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9][A-Z0-9]'
    '**/projects/neuromat/00-DATA/*/*runs'
    '**/projects/neuromat/00-DATA/*/*runs-[A-Z]'
    '**/projects/stereo-linescan/images'
    '**/projects/terras-indigenas/raw'
    '**/projects/text-tracking/FETCH'
    '**/projects/urna/urna-2010/dados-por-urna'
    '**/projects/usd-inflation/00-DATA/split'
    '**/projects/voynich/work/Notes/057/stats'
    '**/projects/wikimapia/obj'
  )

  # Name patterns of the files to be listed:
  local -a source_names=(
    '*.[chf]'
    '*otebook.*[a-zA-Z0-9]'
    'Makefile'
    '*README*'
    '*.make'
    '*.cc'
    '*.el'
    '*.cpp'
    '*.pas'
    '*.sh'
    '*.csh'
    '*.java'
    '*.lsp'
    '*.awk'
    '*.gawk'
    '*.nawk'
    '*.py'
    '*.pl'
    '*.[im][3g]'
  )

  local pat
  local -a prune_args=( ) match_args=( )

  # Pruning clauses, in this order: by name, user-given excludes, by path.
  for pat in "${pruned_names[@]}"; do
    prune_args+=( -name "${pat}" -prune -o )
  done
  prune_args+=( "${excludes[@]}" )
  for pat in "${pruned_paths[@]}"; do
    prune_args+=( -path "${pat}" -prune -o )
  done

  # Alternatives "-name P1 -o -name P2 -o ..." for the files to be listed.
  for pat in "${source_names[@]}"; do
    if [[ ${#match_args[@]} -gt 0 ]]; then
      match_args+=( -o )
    fi
    match_args+=( -name "${pat}" )
  done

  find "${dirs[@]}" \
    "${prune_args[@]}" \
    -type f \( "${match_args[@]}" \) -print
}

echo "looking for source files that are identifiable by name ..." 1>&2
find_named_sources > "${tmp}.chm"

# Filter from stdin to stdout: cleans up a list of file names, one per line.
#
# Removes names that
#   * contain "junk", "save", "snap", "snapshot", or "old" (in any
#     capitalization) as a whole word, that is, preceded by the start of the
#     name or by one of "-", "/", "." and followed by one of those characters
#     or by the end of the name;
#   * are a file called "Deps.make";
#   * end with "~";
#   * are rejected by the global filter command "notheses".
# Then strips a leading "./" or home-directory prefix, and outputs the
# result in reverse sorted order without repeated lines.
clean_source_list() {
  # "sed -E" is required for the "(local)?" groups below: in basic regular
  # expressions the parentheses would be taken literally.
  grep -E -i -v -e '(^|[-/.])(junk|save|snap|snapshot|old)([-/.]|$)' \
    | grep -E -v -e '(^|/)Deps[.]make$' \
    | grep -v -e '[~]$' \
    | "${notheses[@]}" \
    | sed -E \
        -e 's:^[.][/]::' \
        -e 's:^/home/j*stolfi(local)?/::' \
        -e 's:^/home/staff/j*stolfi(local)?/::' \
        -e 's:^/home/stolfi2/stolfi/::' \
    | sort -r \
    | uniq
}

echo "sorting file names and removing additional trash files ..." 1>&2
clean_source_list < "${tmp}.chm" > .all-sources
  
# Filter from stdin to stdout: selects from a list of file names, one per
# line, those that contain "Makefile" or end with ".c" or ".h", except the
# ones that contain "SAVE", "JUNK", or "OLD" (in upper case).  The result is
# in reverse sorted order, without repeated lines.
select_c_sources() {
  grep -E -e 'Makefile|[.][ch]$' \
    | grep -E -v -e 'SAVE|JUNK|OLD' \
    | sort -r \
    | uniq
}

echo "separating C sources ..." 1>&2

select_c_sources < .all-sources > .chfiles

# Remove the temporary files.  The expansions are quoted so that a prefix
# containing blanks or glob characters cannot make rm(1) touch other files.
# NOTE(review): nothing in this file creates "${tmp}.exs"; it looks like a
# leftover, and is harmless because of "-f".
/bin/rm -f -- "${tmp}.chm" "${tmp}.exs"
