#! /usr/bin/gawk -f
# Last edited on 2007-07-05 18:03:07 by stolfi

BEGIN {
  usage = ( ARGV[0] \
    " -v pattern=REGEX \\\n" \
    " [-v initial=BOOL] [-v final=BOOL] \\\n" \
    " [-v dir=DIRNAME] [-v nextFile=NNNNN] \\\n" \
    " < FILE" \
  );

  # Splits a file into blocks of lines, separated by lines that match
  # a given RE {pattern}, and writes each block as a separate file.
  #
  # If {pattern} contains a parenthesized sub-expression,
  # then the part of the line matched by that sub-expression is
  # used as the file name, provided that it is not empty, it does not
  # contain invalid chars, and it does not name an existing file.
  #
  # Otherwise the files are given numeric names starting with
  # {nextFile} (default "00000").  Normally numeric names will be
  # consecutive and will be zero-padded to the same length as the
  # initial {nextFile}.  However, if the program runs into an existing
  # numbered file, then {nextFile} is varied pseudo-randomly.
  #
  # If {initial} is set, each matching line is written out as the
  # first line of the next block.  If {final} is set, each matching line
  # is included as the last line of the preceding block.
  #
  # Progress and diagnostic messages go to "/dev/stderr".

  abort = -1;  # Exit status requested by an error routine; negative means "none".

  if (initial == "") { initial = 0; }
  if (final == "") { final = 0; }
  if (pattern == "") { arg_error(("must specify \"pattern\"")); }
  if (dir == "") { dir = "."; }
  if (nextFile == "") { nextFile = "00000"; }

  split("", flds);  # Strings that match the sub-expressions of {pattern}.
  blockName = "";   # File name from separator (no {dir}), or "" if not specified.
  fileName = "";    # Name of current output file (with {dir}), or "" if none.
  fileStep = 1;     # Increment for {nextFile}.
  totbytes = 0;     # Number of bytes read so far.

  start_new_block();
}

# Bail out as soon as possible if an error routine requested it.
(abort >= 0) { exit abort; }

# Main rule: applied to every input line.
// {
  lin = $0;
  totbytes += length(lin) + 1;

  # Process line:
  if (match(lin, pattern, flds)) {
    # Separator line: close the current block and start the next one.
    if (final) { write_line(lin); }
    finish_current_block();
    # Echo the separator so the user can follow the progress:
    if (lin != "") { printf "%s\n", lin > "/dev/stderr"; }
    # The first parenthesized group, if any, proposes the next file name:
    blockName = ((1 in flds) ? flds[1] : "");
    start_new_block();
    if (initial) { write_line(lin); }
  } else {
    # Ordinary line: belongs to the current block.
    write_line(lin);
  }
  next;
}

function write_line(lin) {
  # Appends {lin} to the current block, opening its file on first use.
  if (nlines == 0) { open_file(); }
  print lin > fileName;
  nlines++;
}

END {
  if (abort >= 0) { exit abort; }
  finish_current_block();
  exit 0;
}

function start_new_block(   ) {
  # Called before each new file.
  # Assumes that the previous file has been closed.
  # Does not open the file yet; wait for the first {print} into it.
  nlines = 0;
}

function open_file(   len, isnum, probe) {
  # Called before writing the first line into a new file.
  # Assumes that the previous file, if any, has been closed.
  # Assumes that {blockName} is the next file name, or "".
  # On return, {fileName} names a file that did not exist when probed.
  #
  # {len}, {isnum} and {probe} are local variables, not parameters.

  # Check whether {blockName} is syntactically valid:
  if (match(blockName, /[][ \\\/!?;$\&\^\`\'\"\|,<>*(){};]/)) {
    # Should also exclude [\000-\037\200-\377]
    printf "file name \"%s\" invalid, ignored\n", blockName > "/dev/stderr";
    blockName = "";
  }

  # Try to set {fileName} to a valid value:
  isnum = 0;
  do {
    # Here {blockName} is syntactically valid or empty.
    # Ensure {blockName} is non-empty.
    if (blockName == "") {
      # Use the next available file number:
      blockName = nextFile;
      isnum = 1;
      # Advance the next file number, keeping the zero-padded width:
      len = length(nextFile);
      nextFile = sprintf("%0*d", len, nextFile + fileStep);
    }

    # Prepend the directory:
    fileName = ( dir "/" blockName );

    # See if file {fileName} exists.  {getline} yields a negative result
    # when the file cannot be opened.  Read into the local {probe} so
    # that the caller's global {lin} is not clobbered.
    if ((getline probe < fileName) >= 0) {
      # File exists:
      close(fileName);
      printf "file \"%s\" exists, skipped\n", fileName > "/dev/stderr";
      # Cancel this block name:
      blockName = "";
      if (isnum) {
        # Collision with numeric file name, so increase the increment:
        fileStep = (int(3*fileStep/2 + totbytes % 11)) + 1;
      }
      # Cancel the file name too, so that the loop tries another one
      # instead of overwriting the existing file:
      fileName = "";
    } else {
      # File doesn't exist, OK
      printf "writing %s ... ", fileName > "/dev/stderr";
      # Reset file number increment to 1
      fileStep = 1;
    }
  } while (fileName == "");
}

function finish_current_block(   ) {
  # Closes the current output file, if any lines were written to it,
  # and reports the line count of the block.
  if (nlines > 0) {
    printf " %d lines\n", nlines > "/dev/stderr";
    close(fileName);
    fileName = "";
    blockName = "";
  } else {
    printf " no lines, omitted\n" > "/dev/stderr";
  }
}

function data_error(msg) {
  # Reports an error in the input data at the current line and aborts.
  printf "line %d: %s\n", FNR, msg > "/dev/stderr";
  abort = 1;
  exit abort;
}

function arg_error(msg) {
  # Reports a command line error, prints the usage message, and aborts.
  printf "%s\n", msg > "/dev/stderr";
  printf "usage: %s\n", usage > "/dev/stderr";
  abort = 1;
  exit abort;
}