#!/bin/sh
#
# @(#) /u/des/src/sortmail/decomposemail 1.21 08/11/17 17:59:22
#
# decomposemail: script for decomposing a mailbox into files with individual
#	messages, named for easy sorting by date:time.
#
# D.Singer, 10/95
#
#
# Copyright (c) 1997 by Daniel E. Singer.  All rights reserved.
# Permission is granted to reproduce and distribute this program
# with the following conditions:
#   1) This copyright notice and the author identification below
#      must be left intact in the program and in any copies.
#   2) Any modifications to the program must be clearly identified
#      in the source file.
#
# Written by:
#	Daniel E. Singer
#	UNIX Systems Administrator
#	Department of Computer Science
#	Duke University, Durham, NC
#	Phone: 919/660-6500
#	Email: des@cs.duke.edu
#
# MODIFICATIONS (not by the original author):
#   - "From " line detection: the day/month/year regular expressions are
#     now anchored as a whole instead of only on the first/last alternative.
#   - file descriptors are redirected to /dev/null instead of being closed
#     (a closed stdout can make the 'type' builtin report failure).
#   - fallback date from the "From " line copes with the time-zone variant.
#   - two-digit years use a fixed 1970-2069 window; the "bad year" upper
#     limit follows the current year instead of a hard-coded 2025.
#   - program name and file base name reach awk through the environment,
#     and shell commands run by awk single-quote their file names.
#   - recompose concatenates with a loop instead of one huge 'cat' command.
#   - duplicated numeric option checks folded into num_arg_check().
#
# Overview:
#   1) parse options;
#   2) pick an awk (GNU Awk preferred; otherwise measure the maximum input
#      line length the awk can take and fold the input to that width);
#   3) for each mail file, let awk split it into one file per message
#      (renamed to <base>.<yyyymmdd>.<hhmmss>.<seq>) or, with -c, into
#      <base>-<chunk> files;
#   4) with -r (and no -c), concatenate the message files back together
#      into <base>.sort.
#

PATH='/usr/bin:/usr/sbin:/bin:/usr/ucb:/usr/bsd:/usr/etc:/usr/local/bin:/usr/gnu/bin'
export PATH

PROG=`basename "$0"`

umask 077	# create files and dirs with restrictive perms,
		# since this involves email;

FILE_BASE_NAME=
MAIL_FILE=
SUFFIX=
CAT=

# most awks can't handle very long input line lengths, so we might need
# to hack around this
AWK_INLINE_MAX=
AWK_INLINE_MAX_MAX=16383	# don't bother to count beyond this
GAWK=0

# these are used to first look for 'gawk',
# else look for some other Awk;
# might need to be adjusted per site;
# the first group is tested for a valid gawk version return string;
# the second group is tested for existence only;
GAWK_PASS="awk gawk nawk /usr/local/bin/awk /usr/local/bin/gawk /usr/gnu/awk /usr/gnu/gawk"
AWK_PASS="nawk /usr/local/bin/nawk awk /usr/local/bin/awk"

VFLAG=0
VECHO=":"	# becomes "echo" with -v; used as: $VECHO "message" >&2
CHUNK=0
CNUM=0
LIMIT=0
LNUM=0
RECOMPOSE=0
RECOMPOSE_SUFFIX=".sort"
SKIPMSGS=0
SNUM=0

# NOTE(review): the line breaks inside this text were lost in the copy this
# was reconstructed from; the layout below is a best guess.
USAGE="
Usage: $PROG [-hrv] [-c num] [-l num] [-s num] [file...]

    -c    split the mailbox into num-message chunks, does not sort;
    -h    help, print this message and exit;
    -l    stop after processing num messages (does not count messages
          skipped with -s);
    -r    recompose, ignored with -c;
    -s    skip the first num messages;
    -v    verbose, more messages;
    file  pathname of a mail file, \"-\" for stdin;
"

# pattern for the extension added to message filenames
EXT_PAT='[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9].[0-9][0-9][0-9][0-9][0-9][0-9].[0-9][0-9][0-9][0-9][0-9]*'

# these might vary for some OS's
AWK=awk		# get a preliminary one for option processing
FOLD="fold -b"
SYS=`uname -sr`
case "$SYS" in
  "FreeBSD "*)
	FOLD="fold"
	;;
  "SunOS 4"*)
	AWK=nawk
	FOLD="fold"
	;;
  "SunOS "*)
	AWK=nawk
esac

# current year, used by awk as the basis for the "bad year" warning;
# if 'date' gives something unexpected, use a value that disables the
# upper limit rather than flagging every message
THIS_YEAR=`date '+%Y' 2>/dev/null`
case "$THIS_YEAR" in
  [0-9][0-9][0-9][0-9])
	;;
  *)
	THIS_YEAR=9999
esac

#
# process command line options
#

SYNTAX="$PROG: option syntax error."

# print the syntax message and the usage text, then exit 1
syntax_error() {
	echo "$SYNTAX" >&2
	echo "$USAGE" >&2
	exit 1
}

# $1 = number of remaining arguments; an option that takes a value
# needs at least one
arg_syntax_check() {
	if [ "$1" -lt 1 ]; then
		syntax_error
	fi
}

# $1 = value given to -c, -l or -s; must be an integer from 1 to 99999
num_arg_check() {
	case "$1" in
	  [1-9]|[1-9][0-9]|[1-9][0-9][0-9]|[1-9][0-9][0-9][0-9]|[1-9][0-9][0-9][0-9][0-9])
		;;
	  *)
		syntax_error
	esac
}

while [ "$#" -gt 0 ]; do
	OPT="$1"
	case "$OPT" in
	  # options without argument
	  '-')		# stdin
		break
		;;
	  -r)
		RECOMPOSE=1
		;;
	  -v)
		VFLAG=1
		VECHO="echo"
		;;
	  -h)
		echo "$USAGE" >&2
		exit 0
		;;
	  # options with argument
	  -c)
		shift
		arg_syntax_check "$#"
		CHUNK=1
		CNUM="$1"
		num_arg_check "$CNUM"
		;;
	  -l)
		shift
		arg_syntax_check "$#"
		LIMIT=1
		LNUM="$1"
		num_arg_check "$LNUM"
		;;
	  -s)
		shift
		arg_syntax_check "$#"
		SKIPMSGS=1
		SNUM="$1"
		num_arg_check "$SNUM"
		;;
	  # ...
	  --)
		shift
		break
		;;
	  # unknown option
	  -?)
		syntax_error
		;;
	  # compound option
	  -??*)
		# only letters, digits and "_" may follow the dash; checking
		# here also keeps arbitrary text out of the awk source below
		case "$OPT" in
		  -*[!a-zA-Z0-9_]*)
			syntax_error
		esac
		# break up a compound option: "-rv" => " -r -v"
		NEW_OPTS=`$AWK 'BEGIN {
			OPT_STR = "'"$OPT"'";
			LEN = length(OPT_STR);
			NEW_OPTS = "";
			for (POS=2; POS+0 <= LEN; ++POS)
				NEW_OPTS = NEW_OPTS " -" substr(OPT_STR,POS,1);
			print NEW_OPTS;
			}' </dev/null` || syntax_error
		shift
		# NEW_OPTS is deliberately unquoted so it splits into words
		set -- $NEW_OPTS ${1:+"$@"}
		continue
		;;
	  # end of options, just command arguments left
	  *)
		break
	esac
	shift
done

#
# check for mailfile arguments
#
if [ "$#" = 0 ]; then
	# read mail from stdin
	set -- -
fi

#
# get the right 'awk' command;
# Perl fans, please stop snickering;
#
AWK=

# first try to locate a copy of GNU Awk, which won't have
# annoying limitations
for TAWK in $GAWK_PASS; do
	# first see if it exists
	AWKSTR=`type "$TAWK" 2>&1` || continue
	# some versions of 'type' won't return non-zero exit status
	case "$AWKSTR" in
	  *" not found")
		continue
	esac
	# then see if it returns a version string we're expecting;
	# stdin comes from /dev/null so an awk that ignores --version
	# cannot sit waiting for input
	AWKSTR=`$TAWK --version </dev/null 2>&1` &&
	  case "$AWKSTR" in
	    *"Gnu Awk"*|*"GNU Awk"*)
		AWK="$TAWK"
		GAWK=1
		break
	  esac
done

if [ "$GAWK" = 0 ]; then

	for TAWK in $AWK_PASS; do
		# same existence test as above: status first, then the text
		AWKSTR=`type "$TAWK" 2>&1` || continue
		case "$AWKSTR" in
		  *" not found")
			continue
		esac
		AWK="$TAWK"
		break
	done

	if [ -z "$AWK" ]; then
		echo "^G$PROG: cannot find AWK command." >&2
		exit 1
	fi

	#
	# find AWK max input line len;
	# some email has _really_ long lines, so we'll have to fold
	# any such lines so they don't break AWK;
	# (hopefully the shell won't break first!)
	#
	LINE=
	PLINE=
	PLEN=0
	LEN=0
	TEN="1234567890"
	HUNDRED="$TEN$TEN$TEN$TEN$TEN$TEN$TEN$TEN$TEN$TEN"
	# count by 100's, 10's, and 1's: grow the line until awk fails or
	# the cap is reached, then step back to the last good length and
	# continue with the next smaller increment
	for STR in "$HUNDRED" "$TEN" "1"; do
		while true; do
			PLINE="$LINE"
			PLEN="$LEN"
			LINE="$LINE$STR"
			LEN=`echo "$LINE" | $AWK '{ print length; }' 2>/dev/null` || break
			[ "$LEN" -lt "$AWK_INLINE_MAX_MAX" ] || break
		done
		LINE="$PLINE"
		LEN="$PLEN"
	done
	AWK_INLINE_MAX="$LEN"

fi

for MAIL_FILE
do

	# might need to uncompress
	CAT='cat'

	case "$MAIL_FILE" in
	  '-')
		MAIL_FILE=
		FILE_BASE_NAME="stdin"
		;;
	  *)
		if [ -z "$MAIL_FILE" ] || [ ! -r "$MAIL_FILE" ] ||
		   [ ! -f "$MAIL_FILE" ]; then
			echo "$PROG: cannot read \"$MAIL_FILE\"." >&2
			exit 1
		fi
		SUFFIX=
		case "$MAIL_FILE" in
		  *".Z")
			CAT='uncompress'
			SUFFIX=".Z"
			;;
		  *".gz")
			CAT='gunzip'
			SUFFIX=".gz"
		esac
		FILE_BASE_NAME="`basename \"$MAIL_FILE\" \"$SUFFIX\"`"
	esac

	$VECHO "$PROG: Processing \"$FILE_BASE_NAME\"" >&2

	#
	# deal with input correctly, and
	# accommodate AWK max input line len if necessary;
	#
	# the awk program gets its strings through the environment and its
	# numbers through -v, so nothing from the command line is pasted
	# into the awk source text;
	# all awk output (progress and warnings) is sent to stderr;
	#
	case "$MAIL_FILE:$GAWK" in
	  :1)	# no file, and gawk:
		# just get stdin
		$CAT
		;;
	  *:1)	# is file, and gawk:
		# get file, possibly uncompressing
		$CAT < "$MAIL_FILE"
		;;
	  :0)	# no file, and no gawk:
		# fold stdin
		$FOLD -$AWK_INLINE_MAX
		;;
	  *:0)	# is file, and no gawk:
		# fold file, possibly uncompressing
		$CAT < "$MAIL_FILE" | $FOLD -$AWK_INLINE_MAX
	esac |
	DECOMPOSEMAIL_PROG="$PROG" DECOMPOSEMAIL_BASE="$FILE_BASE_NAME" \
	$AWK -v CNUM="$CNUM" -v LNUM="$LNUM" -v SNUM="$SNUM" \
	     -v VFLAG="$VFLAG" -v THIS_YEAR="$THIS_YEAR" '
	#
	# NOTE: this program sits inside shell single quotes, so no
	# single quote character may appear in it, not even in comments.
	#
	# Set with -v:
	#   CNUM   messages per chunk, 0 = one file per message
	#   LNUM   stop after this many messages, 0 = no limit
	#   SNUM   number of leading messages to skip
	#   VFLAG  1 = verbose
	#   THIS_YEAR  current year, basis for the "bad year" warning
	#
	BEGIN {
		PROG = ENVIRON["DECOMPOSEMAIL_PROG"];
		TMP_FILE_BASE = ENVIRON["DECOMPOSEMAIL_BASE"];
		TMP_FILE = "";
		IN_TMP_FILE = 0;

		# latest year accepted without a warning
		MAX_YEAR = THIS_YEAR + 1;

		# month name => two-digit number, day name => "1".."7";
		# also in all upper case, just in case...
		NM = split("Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec", NAMES, " ");
		for (IDX = 1; IDX <= NM; ++IDX) {
			MON[NAMES[IDX]] = sprintf("%02d", IDX);
			MON[toupper(NAMES[IDX])] = sprintf("%02d", IDX);
		}
		NM = split("Sun Mon Tue Wed Thu Fri Sat", NAMES, " ");
		for (IDX = 1; IDX <= NM; ++IDX) {
			DOW[NAMES[IDX]] = IDX "";
			DOW[toupper(NAMES[IDX])] = IDX "";
		}

		# assume that input begins at the start of a message,
		# so these are set to mimic end of a message
		WAS_BLANK = 1;	# last line was a blank, or beginning-of-file
		IN_HEADER = 0;	# currently in message header
		MSG_COUNT = 0;	# current message, first is "1"
		C_COUNT = 0;	# current "chunk", first is "1"
		L_COUNT = 0;	# limit count, first is "1"
	}

	{
		#
		# start of a message header;
		# first line of input should match this;
		#
		if (WAS_BLANK && is_from_line()) {

			#
			# if limit is exceeded, exit like end-of-file;
			#
			if (LNUM+0 > 0 && L_COUNT+0 >= LNUM) {
				#
				# need to suck up the rest of the input to
				# avoid a "broken pipe" message
				#
				while ((getline JUNK) > 0)
					;
				exit;
			}

			#
			# wrap up the previous message;
			# do this again at EOF;
			#
			if (MSG_COUNT+0 > SNUM+0 && CNUM+0 <= 0)
				end_of_message();

			# initialize for beginning of message
			IN_HEADER = 1;
			WAS_BLANK = 0;
			DATE_LINE = $0;	# hang on to this for date, just in
					# case there is no date line

			++MSG_COUNT;
			if (MSG_COUNT+0 <= SNUM+0)
				next;
			++L_COUNT;

			#
			# if doing chunks, start a new file when chunk
			# count is exceeded.
			#
			if (CNUM+0 > 0) {
				if (((MSG_COUNT - (SNUM+1)) % CNUM) == 0) {
					if (C_COUNT+0 > 0)
						close_tmp_file();
					++C_COUNT;
					TMP_FILE = sprintf("%s-%05d",TMP_FILE_BASE,C_COUNT);
					if (VFLAG == 1)
						printf("\n%s: beginning %s\n",PROG,TMP_FILE);
					system("cp /dev/null " shq(TMP_FILE));
				}
				if (VFLAG == 1)
					printf("%s: %s (%s). %s\n",PROG,((MSG_COUNT-(SNUM+1))%CNUM)+1,MSG_COUNT,TMP_FILE);
			}
			#
			# otherwise, make a new temp file for each new message
			#
			else {
				# initialize file name for this message;
				# will be renamed to include date.time;
				TMP_FILE = sprintf("%s.%05d",TMP_FILE_BASE,MSG_COUNT);
				system("cp /dev/null " shq(TMP_FILE));
			}

			print > TMP_FILE;
			IN_TMP_FILE = 1;
			next;
		}

		#
		# not start of header...
		#

		if ($0 == "") {
			# blank line: ends the header, and allows the
			# next line to start a new message
			WAS_BLANK = 1;
			IN_HEADER = 0;
		}
		else {
			# date line: this is what is used to serialize
			# the messages; if there are several, the last
			# one seen in the header wins
			if (IN_HEADER == 1 && $0 ~ /^Date: /)
				DATE_LINE = $0;
			WAS_BLANK = 0;
		}

		# every line of a message that is not skipped gets copied
		if (MSG_COUNT+0 <= SNUM+0)
			next;
		print > TMP_FILE;
		IN_TMP_FILE = 1;
		next;
	}

	#
	# return 1 if the current line looks like a "From " envelope line
	# Eg: From des@cs.duke.edu Fri May  1 02:09:48 1998
	# Eg: From des@cs.duke.edu Fri May  1 02:09 EDT 1998
	#
	function is_from_line()
	{
		if ($0 !~ /^From /)
			return 0;
		if ($3 !~ /^(Sun|Mon|Tue|Wed|Thu|Fri|Sat)$/)
			return 0;
		if ($4 !~ /^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)$/)
			return 0;
		if ($5 !~ /^[0-9][0-9]?$/)
			return 0;
		if ($6 !~ /^[0-9][0-9]?:[0-9][0-9](:[0-9][0-9]?)?$/)
			return 0;
		# year directly after the time ...
		if ($7 ~ /^(19[6-9]|20[0-9])[0-9]$/)
			return 1;
		# ... or a three-letter zone, then the year
		return ($7 ~ /^[A-Z][A-Z][A-Z]$/ && $8 ~ /^(19[6-9]|20[0-9])[0-9]$/);
	}

	#
	# quote a string for use as one word in a system() command:
	# wrap it in single quotes (octal 047), and turn each embedded
	# single quote into: quote, backslash, quote, quote
	#
	function shq(S,    CNT, K, PARTS, OUT)
	{
		CNT = split(S, PARTS, "\047");
		OUT = PARTS[1];
		for (K = 2; K <= CNT; ++K)
			OUT = OUT "\047\\\047\047" PARTS[K];
		return "\047" OUT "\047";
	}

	#
	# get the date for serializing files;
	# reads DATE_LINE (the "Date:" header, else the "From " line) and
	# MSG_COUNT; sets SERIAL to "yyyymmdd.hhmmss.nnnnn";
	# works on global variables throughout
	#
	function do_date()
	{
		#  0 = no recognizable line
		# -1 = bad format
		#  1 = parsable, at least somewhat
		DATE_OK = 0;

		NUM_TOKENS = split(DATE_LINE,STRS);

		if (STRS[1] == "Date:") {
			DATE_OK = -1;
			DOW_FOUND = 0;
			DATE_FOUND = 0;
			MONTH_FOUND = 0;
			YEAR_FOUND = 0;
			TIME_FOUND = 0;
			ZONE_FOUND = 0;
			NUM_FOUND = 0;
			DATE = "";
			MONTH = "";
			YEAR = "";
			TIME = "";

			# check for very weird case, which can be coerced
			# Date: 30-MAY-1990 16:13:49.88
			if (STRS[2] ~ /^[0-9]+-[A-Za-z]+-[0-9]+$/ \
			  && STRS[3] ~ /^[0-9]+:[0-9]+/) {
				DLT = DATE_LINE;
				sub("-"," ",DLT);
				sub("-"," ",DLT);
				NUM_TOKENS = split(DLT,STRS);
			}

			# each token is claimed by the first slot that is
			# still empty and whose pattern it fits, in the
			# order: day-of-week, month, date, year, time, zone
			for (TN=2; TN <= NUM_TOKENS; ++TN) {
				TOKEN = STRS[TN];
				TOK3 = substr(TOKEN,1,3);

				if (DOW_FOUND == 0 && (TOK3 in DOW)) {
					DOW_FOUND = 1;
					continue;
				}

				if (MONTH_FOUND == 0 && (TOK3 in MON)) {
					MONTH = TOKEN;
					MONTH_FOUND = 1;
					++NUM_FOUND;
					continue;
				}

				if (DATE_FOUND == 0 && TOKEN ~ /^[0-9]+$/) {
					DATE = TOKEN;
					DATE_FOUND = 1;
					++NUM_FOUND;
					continue;
				}

				if (YEAR_FOUND == 0 \
				  && TOKEN ~ /^[0-9][0-9]+/ \
				  && TOKEN !~ /:/) {
					YEAR = TOKEN;
					YEAR_FOUND = 1;
					++NUM_FOUND;
					continue;
				}

				if (TIME_FOUND == 0 \
				  && (TOKEN ~ /^[0-9][0-9:]+/ \
				    || TOKEN ~ /^[0-9][0-9][0-9][0-9]/)) {
					TIME = TOKEN;
					TIME_FOUND = 1;
					++NUM_FOUND;
					continue;
				}

				if (TOKEN ~ /^\(?[A-Z][A-Z][A-Z]/ \
				  || TOKEN ~ /^[+-]?[0-9][0-9][0-9]/) {
					ZONE_FOUND = 1;
					continue;
				}
			}

			# a lone number too big for a day of month
			# must have been the year
			if (DATE_FOUND == 1 && YEAR_FOUND == 0 && DATE+0 > 31) {
				YEAR = DATE;
				DATE = "";
				YEAR_FOUND = 1;
				DATE_FOUND = 0;
			}

			if (NUM_FOUND+0 >= 3)
				DATE_OK = 1;
		}
		else if (STRS[1] == "From") {
			# From dsqa@ncat.edu Mon Apr 17 16:29:15 1995
			# From dsqa@ncat.edu Mon Apr 17 16:29 EST 1995
			printf("%s: %s. %s\n",PROG,MSG_COUNT,"using date from \"From \" line.");
			MONTH = STRS[4];
			DATE = STRS[5];
			TIME = STRS[6];
			# in the second form field 7 is the zone and the
			# year follows it
			YEAR = (STRS[7] ~ /^[0-9]+$/) ? STRS[7] : STRS[8];
			DATE_OK = 1;
		}

		if (DATE_OK != 1) {
			if (DATE_OK == 0)
				printf("%s: %s. NO DATE !!\n",PROG,MSG_COUNT);
			else
				printf("%s: %s. bad date line: [%s]\n",PROG,MSG_COUNT,DATE_LINE);
			# placeholder values that sort before any real date
			MONTH = "00";
			DATE = "00";
			YEAR = "1000";
			TIME = "00:00:00";
			HOUR = "00";
			MINUTE = "00";
			SECOND = "00";
			DATE_STR = YEAR MONTH DATE;
			TIME_STR = HOUR MINUTE SECOND;
			SERIAL = sprintf("%s.%s.%05d",DATE_STR,TIME_STR,MSG_COUNT);
			return;
		}

		DATE_LINE = "";

		# get rid of inappropriate stuff tacked on
		sub(/[^0-9].*/,"",DATE);
		sub(/[^0-9].*/,"",YEAR);
		sub(/[^0-9:].*/,"",TIME);

		if (DATE+0 < 1 || DATE+0 > 31)
			printf("%s: %s. bad date: [%s]\n",PROG,MSG_COUNT,DATE);
		DATE = sprintf("%02d",DATE);

		# one month had a period at the end
		MONTH_HOLD = MONTH;
		MONTH = substr(MONTH,1,3);
		MONTH = (MONTH in MON) ? MON[MONTH] : "";
		if (MONTH == "") {
			printf("%s: %s. bad month: [%s]\n",PROG,MSG_COUNT,MONTH_HOLD);
			MONTH = "00";
		}

		if (YEAR+0 < 70 || (YEAR+0 > 99 && YEAR+0 < 1970) || YEAR+0 > MAX_YEAR)
			printf("%s: %s. bad year: [%s]\n",PROG,MSG_COUNT,YEAR);
		# two-digit years: 70-99 are 19xx, 00-69 are 20xx
		if (YEAR+0 < 100)
			YEAR = sprintf("%s%02d",(YEAR+0 >= 70) ? "19" : "20",YEAR);
		else if (YEAR+0 < 1000)
			YEAR = sprintf("2%03d",YEAR);

		DATE_STR = YEAR MONTH DATE;

		# a time without colons is taken as hhmm or hhmmss
		if (TIME ~ /^[0-9]+$/)
			TIME = substr(TIME,1,2) ":" substr(TIME,3,2) ":" substr(TIME,5,2);
		N = split(TIME,TIME_ARRAY,":");
		HOUR = TIME_ARRAY[1];
		MINUTE = TIME_ARRAY[2];
		SECOND = TIME_ARRAY[3];
		# a missing or non-numeric part becomes "99", which the range
		# checks below then report; missing seconds are just "00"
		HOUR = (N >= 1 && HOUR ~ /^[0-9]+$/) ? HOUR : "99";
		MINUTE = (N >= 2 && MINUTE ~ /^[0-9]+$/) ? MINUTE : "99";
		SECOND = (N >= 3 ? ((SECOND ~ /^[0-9]+$/) ? SECOND : "99") : "00");
		if (N+0 < 1 || N+0 > 3)
			printf("%s: %s. bad time: [%s]\n",PROG,MSG_COUNT,TIME);
		if (HOUR+0 < 0 || HOUR+0 > 23) {
			printf("%s: %s. bad hour: [%s]\n",PROG,MSG_COUNT,HOUR);
			HOUR = 0;
		}
		if (MINUTE+0 < 0 || MINUTE+0 > 59) {
			printf("%s: %s. bad minute: [%s]\n",PROG,MSG_COUNT,MINUTE);
			MINUTE = 0;
		}
		if (SECOND+0 < 0 || SECOND+0 > 59) {
			printf("%s: %s. bad second: [%s]\n",PROG,MSG_COUNT,SECOND);
			SECOND = 0;
		}

		TIME_STR = sprintf("%02d%02d%02d",HOUR,MINUTE,SECOND);

		SERIAL = sprintf("%s.%s.%05d",DATE_STR,TIME_STR,MSG_COUNT);
	}

	#
	# close the current output file, if anything was written to it
	#
	function close_tmp_file()
	{
		if (IN_TMP_FILE) {
			close(TMP_FILE);
			IN_TMP_FILE = 0;
		}
	}

	#
	# what to do at EOM: close the message file and rename it
	# to include the date.time serial
	#
	function end_of_message()
	{
		if (MSG_COUNT+0 > 0) {
			close_tmp_file();
			do_date();
			NEW_FILE = sprintf("%s.%s",TMP_FILE_BASE,SERIAL);
			if (VFLAG == 1)
				printf("%s: %s. %s => %s\n",PROG,MSG_COUNT,TMP_FILE,NEW_FILE);
			system("mv " shq(TMP_FILE) " " shq(NEW_FILE));
		}
	}

	END {
		# finish whatever message or chunk was in progress
		if (MSG_COUNT+0 > SNUM+0) {
			if (CNUM+0 > 0)
				close_tmp_file();
			else
				end_of_message();
		}

		if (LNUM+0 > 0 && L_COUNT+0 >= LNUM)
			printf("%s: limit %d reached.\n",PROG,LNUM);
		printf("%s: processed %d messages in \"%s\".\n",PROG,MSG_COUNT,TMP_FILE_BASE);
	}
	' >&2

	#
	# if RECOMPOSE, recombine the messages into a new file
	#
	if [ "$RECOMPOSE" = 1 ] && [ "$CHUNK" = 0 ]; then

		# keep an earlier result by moving it to the first free
		# numbered name
		if [ -f "$FILE_BASE_NAME$RECOMPOSE_SUFFIX" ]; then
			EXT=1
			while [ -f "$FILE_BASE_NAME$RECOMPOSE_SUFFIX.$EXT" ]; do
				EXT=`expr "$EXT" '+' '1'`
			done
			mv "$FILE_BASE_NAME$RECOMPOSE_SUFFIX" "$FILE_BASE_NAME$RECOMPOSE_SUFFIX.$EXT"
		fi

		$VECHO "$PROG: recomposing messages to \"$FILE_BASE_NAME$RECOMPOSE_SUFFIX\"." >&2

		#
		# the shell expands the pattern in sorted order, which is
		# date.time order because of how the files are named;
		# looping, rather than handing every name to a single 'cat',
		# keeps a big mailbox from overflowing the argument list,
		# and the -f test skips the unexpanded pattern when nothing
		# matches; $EXT_PAT is deliberately unquoted so it globs
		#
		for MSG_FILE in "$FILE_BASE_NAME".$EXT_PAT; do
			[ -f "$MSG_FILE" ] || continue
			cat "$MSG_FILE"
		done > "$FILE_BASE_NAME$RECOMPOSE_SUFFIX"

	fi

done

$VECHO "$PROG: done." >&2

exit 0