#! /bin/bash
# Last edited on 2024-03-27 19:11:49 by stolfi

# Program identification and help text.
#
# PROG_NAME is the basename of the script as invoked.  PROG_HELP holds the
# one-line synopsis printed (with "echo -e") when an unrecognized argument
# is given.  PROG_INFO is the long manual-page style text; every element
# starts with a literal "\n" so that it can be printed with "echo -e".
# NOTE(review): nothing visible in this script prints PROG_INFO; it is
# presumably kept for a "-info" style option or for the reader.
PROG_NAME=${0##*/}
PROG_DESC="a filter that reads a file list, such as the output of {find_all_files_size_date.sh}, and excludes junk and derived files"
PROG_HELP=( "${PROG_NAME} [ -exclude-tilde ] < {INFILE}.sdf > {OUTFILE}.sdf" )
PROG_INFO=(
  "\nNAME"
  "\n  ${PROG_NAME} - ${PROG_DESC}."
  "\n"
  "\nSYNOPSIS"
  "\n  ${PROG_HELP[*]}"
  "\n"
  "\nDESCRIPTION"
  "\n"
  "\n  Reads a list of files from {stdin}. Discards uninteresting files, writes the rest to {stdout}."
  "\n"
  "\n  The input list may be"
  "\n  - a plain list of file names;"
  "\n  - list of links with ' -> ';"
  "\n  - list of size-date-filename as produced by {find_all_files_size_date.sh}"
  "\n  - list of cksum-size-filename as produced by {find_all_files_cksum_size.sh}"
  "\n  - list of cksum-size-date-filename as produced by {find_all_files_cksum_size_date.sh}"
  "\n"
  "\n  Excludes from the list the entries under known cache and configuration"
  "\n  directories such as \".cache\", \".thumbnails\", or \".config/chromium\";"
  "\n  derived or expendable files such as \"*.o\", \"*.pyc\", \"*.aux\", \"*.log\";"
  "\n  path components that begin with \"JUNK\"; and a few others."
  "\n  Entries that contain an escaped blank (a backslash followed by a space)"
  "\n  are discarded too."
  "\n"
  "\n  Also excludes the executable binaries listed in {${HOME}/programs/binaries.dir}"
  "\n"
  "\nOPTIONS"
  "\n  -exclude-tilde"
  "\n    Also excludes entries whose name ends with a tilde (\"~\")."
  "\n"
  "\nSEE ALSO"
  "\n  find_all_files_size_date.sh, find_all_files_cksum_size.sh, find_all_files_cksum_size_date.sh"
  "\n"
  "\nAUTHOR"
  "\n  Created 2020-09-04 by Jorge Stolfi, Unicamp"
  "\n  Modified 2022-10-16 by Stolfi"
)

# Command line processing.
#
# The only option understood is "-exclude-tilde", which sets {extilde} to 1
# (the default is 0).  Any other argument prints a complaint and the usage
# synopsis to stderr and terminates the script with status 1.

extilde=0
while (( $# > 0 )); do
  case "$1" in
    -exclude-tilde)
      extilde=1
      shift
      ;;
    *)
      echo 'unrecognized argument "'"$1"'" ...' >&2
      echo -e "usage:\n  ${PROG_HELP[@]}" >&2
      exit 1
      ;;
  esac
done

# Prefix for temporary file names
tmp="/tmp/$$"

# Build the sed script {xfile} that deletes the unwanted entries.
#
# Every rule has the form "\:REGEX:d" (":" is the regex delimiter, so that
# "/" can be used freely in path patterns).  The regexes are sed *basic*
# regular expressions using the GNU extensions "\|" (alternation) and "\b"
# (word boundary); grouping therefore must be written "\(...\)".
#
# Most rules start with "\([ ]\|^\)" or "\([ /]\|^\)" so that the name is
# matched only at the start of the line, after a blank (the field separator
# of the size/date/cksum formats), or -- in the second form -- after a "/".
#
# The here-document delimiter is unquoted, so the shell turns "\\" into "\"
# before sed sees it; no other sequence in the text is altered by the shell.
#
# The file is created with {mktemp} so that its name cannot be predicted
# by other users of /tmp.  It is left behind on purpose so that it can be
# inspected afterwards.

echo "excluding junk files ..." 1>&2
xfile="$(mktemp "${tmp}_xtrash.sed.XXXXXX")" || {
  echo "** cannot create temporary file ${tmp}_xtrash.sed.XXXXXX" 1>&2
  exit 1
}

echo "preparing ${xfile} ..." 1>&2

cat >"${xfile}" <<EOF

# Files with blanks in name:
\:[\\][ ]:d

# Trash top directories:
\:\([ ]\|^\)[.]adobe\b:d
\:\([ ]\|^\)[.]audacity\b:d
\:\([ ]\|^\)[.]cache\b:d
\:\([ ]\|^\)[.]ccache\b:d
\:\([ ]\|^\)[.]dasher\b:d
\:\([ ]\|^\)[.]dbus\b:d
\:\([ ]\|^\)[.]designer\b:d
\:\([ ]\|^\)[.]dropbox\b:d
\:\([ ]\|^\)[.]dt\b:d
\:\([ ]\|^\)[.]dups\b:d
\:\([ ]\|^\)[.]eXtace\b:d
\:\([ ]\|^\)[.]eggcups\b:d
\:\([ ]\|^\)[.]emacs[.]d/session\b:d
\:\([ ]\|^\)[.]esd_auth\b:d
\:\([ ]\|^\)[.]esmtp_queue\b:d
\:\([ ]\|^\)[.]evolution\b:d
\:\([ ]\|^\)[.]fontconfig\b:d
\:\([ ]\|^\)[.]fullcircle\b:d
\:\([ ]\|^\)[.]galeon\b:d
\:\([ ]\|^\)[.]gconf3\b:d
\:\([ ]\|^\)[.]gconfd3\b:d
\:\([ ]\|^\)[.]gegl\b:d
\:\([ ]\|^\)[.]gimp-2.8\b:d
\:\([ ]\|^\)[.]gitconfig\b:d
\:\([ ]\|^\)[.]gnome\b:d
\:\([ ]\|^\)[.]gnupg\b:d
\:\([ ]\|^\)[.]gnuplot\b:d
\:\([ ]\|^\)[.]gstreamer\b:d
\:\([ ]\|^\)[.]icedtea\b:d
\:\([ ]\|^\)[.]icons\b:d
\:\([ ]\|^\)[.]java\b:d
\:\([ ]\|^\)[.]jetadmin\b:d
\:\([ ]\|^\)[.]jpi_cache\b:d
\:\([ ]\|^\)[.]jssc\b:d
\:\([ ]\|^\)[.]kde\b:d
\:\([ ]\|^\)[.]local\b:d
\:\([ ]\|^\)[.]mailboxlist\b:d
\:\([ ]\|^\)[.]mailcap\b:d
\:\([ ]\|^\)[.]mcop\b:d
\:\([ ]\|^\)[.]mcoprc\b:d
\:\([ ]\|^\)[.]metacity\b:d
\:\([ ]\|^\)[.]mime-types\b:d
\:\([ ]\|^\)[.]mozilla\b:d
\:\([ ]\|^\)[.]mplayer\b:d
\:\([ ]\|^\)[.]nautilus\b:d
\:\([ ]\|^\)[.]nedit\b:d
\:\([ ]\|^\)[.]netscape\b:d
\:\([ ]\|^\)[.]nv\b:d
\:\([ ]\|^\)[.]openoffice\b:d
\:\([ ]\|^\)[.]pki\b:d
\:\([ ]\|^\)[.]povray\b:d
\:\([ ]\|^\)[.]thumbnails\b:d
\:\([ ]\|^\)[.]thunderbird\b:d
\:\([ ]\|^\)[.]Xauthority\b:d

\:\([ ]\|^\)snap/chromium\b:d

# Trash config subfolders:
\:\([ ]\|^\)[.]config/caja\b:d
\:\([ ]\|^\)[.]config/chromium\b:d
\:\([ ]\|^\)[.]config/flowblade\b:d
\:\([ ]\|^\)[.]config/google\b:d
\:\([ ]\|^\)[.]config/libreoffice\b:d
\:\([ ]\|^\)[.]config/macromedia\b:d
\:\([ ]\|^\)[.]config/pulse\b:d
\:\([ ]\|^\)[.]config/texlive2013\b:d
\:\([ ]\|^\)[.]config/Qt:d
\:\([ ]\|^\)[.]config/Skype\b:d
\:\([ ]\|^\)[.]config/Trolltech:d
\:\([ ]\|^\)[.]config/akonadi:d
\:\([ ]\|^\)[.]config/atril\b:d
\:\([ ]\|^\)[.]config/caja\b:d
\:\([ ]\|^\)[.]config/chromium\b:d
\:\([ ]\|^\)[.]config/compiz\b:d
\:\([ ]\|^\)[.]config/dnfdragora\b:d
\:\([ ]\|^\)[.]config/enchant\b:d
\:\([ ]\|^\)[.]config/evince\b:d
\:\([ ]\|^\)[.]config/exaile\b:d
\:\([ ]\|^\)[.]config/fontforge\b:d
\:\([ ]\|^\)[.]config/gnote\b:d
\:\([ ]\|^\)[.]config/gnuplot\b:d
\:\([ ]\|^\)[.]config/google-chrome\b:d
\:\([ ]\|^\)[.]config/guvcview:d
\:\([ ]\|^\)[.]config/inkscape:d
\:\([ ]\|^\)[.]config/kalarmrc:d
\:\([ ]\|^\)[.]config/kconf:d
\:\([ ]\|^\)[.]config/libreoffice:d
\:\([ ]\|^\)[.]config/octave\b:d
\:\([ ]\|^\)[.]config/pavucontrol:d
\:\([ ]\|^\)[.]config/pluma\b:d
\:\([ ]\|^\)[.]config/sealert\b:d
\:\([ ]\|^\)[.]config/specialmailcollectionsrc\b:d
\:\([ ]\|^\)[.]config/xfburn\b:d
\:\([ ]\|^\)[.]config/xfce4\b:d
\:\([ ]\|^\)[.]config/yumex:d


# Mirror directories:
\:\([ ]\|^\)PUB/include/:d
\:\([ ]\|^\)stolfi/include/:d

# Expendable extensions, such as derived files:
\:[.][ao]$:d
\:[.]pyc$:d
\:[.]io$:d
\:[.]ho$:d
\:[.]so$:d
\:[.]aux$:d
\:[.]errs$:d
\:[.]log$:d
\:[.]blg$:d
\:[.]toc$:d
\:[.]dvi$:d
\:[.]lof$:d
\:[.]lot$:d
\:[.]dmg$:d
\:[.]done$:d
\:[.]did$:d
\:[.]diff$:d
\:[.]prdiff$:d
\:[.]so[.][0-9][0-9]*$:d
\:[.]class$:d

# Trash subfolders of any folder:
\:\([ /]\|^\)JUNK:d
\:\([ /]\|^\)[.]macromedia/:d
\:\([ /]\|^\)[.]svn\([ /]\|$\):d
\:\([ /]\|^\)[.]git\([ /]\|$\):d
\:\([ /]\|^\)__pycache__/:d
\:\([ /]\|^\)akonadi:d
\:\([ /]\|^\)dropbox:d
\:\([ /]\|^\)exaile:d
\:\([ /]\|^\)out[.]old/:d
\:\([ /]\|^\)out[_0-9]*/:d
\:\([ /]\|^\)testes/saida/:d

# Trash dotfiles in top folder and subfolders:
\:\([ /]\|^\)[.][uvwxyz][uvwxyz]\b:d
\:\([ /]\|^\)[.]absent\b:d
\:\([ /]\|^\)[.]adobe\b:d
\:\([ /]\|^\)[.]all-:d
\:\([ /]\|^\)[.]all-files-:d
\:\([ /]\|^\)[.]all\b:d
\:\([ /]\|^\)[.]asz\b:d
\:\([ /]\|^\)[.]audacity:d
\:\([ /]\|^\)[.]cache\b:d
\:\([ /]\|^\)[.]ccache\b:d
\:\([ /]\|^\)[.]dasher\b:d
\:\([ /]\|^\)[.]dbus\b:d
\:\([ /]\|^\)[.]deleted\b:d
\:\([ /]\|^\)[.]deps[.]make$:d
\:\([ /]\|^\)[.]designer\b:d
\:\([ /]\|^\)[.]did\b:d
\:\([ /]\|^\)[.]diff\b:d
\:\([ /]\|^\)[.]done\b:d
\:\([ /]\|^\)[.]dropbox-dist\b:d
\:\([ /]\|^\)[.]dropbox\b:d
\:\([ /]\|^\)[.]dt\b:d
\:\([ /]\|^\)[.]dup\b:d
\:\([ /]\|^\)[.]dups:d
\:\([ /]\|^\)[.]eXtace\b:d
\:\([ /]\|^\)[.]eggcups\b:d
\:\([ /]\|^\)[.]empty\b:d
\:\([ /]\|^\)[.]esd_auth\b:d
\:\([ /]\|^\)[.]esmtp_queue\b:d
\:\([ /]\|^\)[.]evolution\b:d
\:\([ /]\|^\)[.]extra\b:d
\:\([ /]\|^\)[.]flowblade\b:d
\:\([ /]\|^\)[.]fontconfig\b:d
\:\([ /]\|^\)[.]fullcircle\b:d
\:\([ /]\|^\)[.]galeon\b:d
\:\([ /]\|^\)[.]gconf3\b:d
\:\([ /]\|^\)[.]gconfd3\b:d
\:\([ /]\|^\)[.]gegl-:d
\:\([ /]\|^\)[.]gimp-2.8\b:d
\:\([ /]\|^\)[.]gitconfig\b:d
\:\([ /]\|^\)[.]gnome2\b:d
\:\([ /]\|^\)[.]gnome2_private\b:d
\:\([ /]\|^\)[.]gnome3\b:d
\:\([ /]\|^\)[.]gnome\b:d
\:\([ /]\|^\)[.]gnupg\b:d
\:\([ /]\|^\)[.]gnuplot_history\b:d
\:\([ /]\|^\)[.]gstreamer-:d
\:\([ /]\|^\)[.]icedteaplugin\b:d
\:\([ /]\|^\)[.]icons\b:d
\:\([ /]\|^\)[.]java\b:d
\:\([ /]\|^\)[.]jetadmin\b:d
\:\([ /]\|^\)[.]jpi_cache\b:d
\:\([ /]\|^\)[.]jssc\b:d
\:\([ /]\|^\)[.]kde3\b:d
\:\([ /]\|^\)[.]kde\b:d
\:\([ /]\|^\)[.]lesshst\b:d
\:\([ /]\|^\)[.]local\b:d
\:\([ /]\|^\)[.]lock\b:d
\:\([ /]\|^\)[.]macromedia\b:d
\:\([ /]\|^\)[.]mailboxlist\b:d
\:\([ /]\|^\)[.]mailcap\b:d
\:\([ /]\|^\)[.]mcop\b:d
\:\([ /]\|^\)[.]mcoprc\b:d
\:\([ /]\|^\)[.]metacity\b:d
\:\([ /]\|^\)[.]mime-types\b:d
\:\([ /]\|^\)[.]miss:d
\:\([ /]\|^\)[.]mozilla\b:d
\:\([ /]\|^\)[.]mplayer\b:d
\:\([ /]\|^\)[.]nautilus\b:d
\:\([ /]\|^\)[.]nedit\b:d
\:\([ /]\|^\)[.]netscape6\b:d
\:\([ /]\|^\)[.]nv\b:d
\:\([ /]\|^\)[.]openoffice:d
\:\([ /]\|^\)[.]ok:d
\:\([ /]\|^\)[.]pee\b:d
\:\([ /]\|^\)[.]pki\b:d
\:\([ /]\|^\)[.]played\b:d
\:\([ /]\|^\)[.]povray\b:d
\:\([ /]\|^\)[.]pretty\b:d
\:\([ /]\|^\)[.]pulse\b:d
\:\([ /]\|^\)[.]qt\b:d
\:\([ /]\|^\)[.]redhat\b:d
\:\([ /]\|^\)[.]rhn-applet.conf\b:d
\:\([ /]\|^\)[.]rhopenoffice1.1\b:d
\:\([ /]\|^\)[.]spamassasin\b:d
\:\([ /]\|^\)[.]ssh2\b:d
\:\([ /]\|^\)[.]ssh\b:d
\:\([ /]\|^\)[.]sss\b:d
\:\([ /]\|^\)[.]subversion\b:d
\:\([ /]\|^\)[.]sversionrc\b:d
\:\([ /]\|^\)[.]teminfo\b:d
\:\([ /]\|^\)[.]texlive2007\b:d
\:\([ /]\|^\)[.]texlive2013\b:d
\:\([ /]\|^\)[.]texlive2017\b:d
\:\([ /]\|^\)[.]texlive\b:d
\:\([ /]\|^\)[.]themes\b:d
\:\([ /]\|^\)[.]thumbnails\b:d
\:\([ /]\|^\)[.]tmp\b:d
\:\([ /]\|^\)[.]to-del\b:d
\:\([ /]\|^\)[.]toadd\b:d
\:\([ /]\|^\)[.]todel\b:d
\:\([ /]\|^\)[.]torm\b:d
\:\([ /]\|^\)[.]uha\b:d
\:\([ /]\|^\)[.]units_history\b:d
\:\([ /]\|^\)[.]ups\b:d
\:\([ /]\|^\)[.]vm\b:d
\:\([ /]\|^\)[.]vnc\b:d
\:\([ /]\|^\)[.]w3\b:d
\:\([ /]\|^\)[.]wapi\b:d
\:\([ /]\|^\)[.]wget-hsts\b:d
\:\([ /]\|^\)[.]xcdroast\b:d
\:\([ /]\|^\)[.]xchat\b:d
\:\([ /]\|^\)[.]xdvirc\b:d
\:\([ /]\|^\)[.]xfig\b:d
\:\([ /]\|^\)[.]xfigrc\b:d
\:\([ /]\|^\)[.]xmms\b:d
\:\([ /]\|^\)[.]xscreensaver\b:d
\:\([ /]\|^\)[.]xsession-errors:d

# Other trash files:
\:\([ /]\|^\)[,]:d
\:\([ /]\|^\)[\#].*[\#]$:d
\:/[=][=][=]*$:d

# Directories that are already backups:
\:\([ ]\|^\)pkg:d

EOF

# Optional rule: when "-exclude-tilde" was given ({extilde} non-zero), also
# delete the entries whose name ends with "~".
if [[ "${extilde:-0}" -ne 0 ]]; then
  echo '\:[~]$:d' >> "${xfile}"
fi

# Append one delete rule "\:programs/NAME:d" for every line NAME of the
# binaries list.  Blank lines are skipped: they would otherwise become the
# rule "\:programs/:d", which deletes every entry that contains "programs/".
# NOTE(review): the names are inserted into the regex verbatim, so a "." in
# a name matches any character -- presumably acceptable; confirm.
binlist="${HOME}/programs/binaries.dir"
if [[ -r "${binlist}" ]]; then
  sed \
      -e '/^[[:space:]]*$/d' \
      -e 's@^@\\:programs/@g' \
      -e 's@$@:d@g' \
    < "${binlist}" \
    >> "${xfile}"
else
  echo "warning: cannot read ${binlist}, no binaries will be excluded" 1>&2
fi

echo "see ${xfile}" 1>&2

# Run the filter proper: stdin -> stdout.  A failure here (e.g. a malformed
# rule in {xfile}) must not be reported as success.
/bin/sed -f "${xfile}" || {
  echo "** sed failed on ${xfile}" 1>&2
  exit 1
}

echo "done." 1>&2

# The rules file is deliberately kept for inspection:
# /bin/rm -f ${xfile}
