# ~/.procmailrc as a demonstration of using fuzzy checksums
# to recognize emails

# (c) iX 5/2004, Bert Ungerer <un@ix.de>, www.nixspam.org

# 2005-11-07: Now in English. Thanks to Adnan and Sven
# for their suggestions.

# Some versions of md5sum add " -" to the hash. This can be
# removed with tr.

# Some more tightening of the requirements for the hash
# calculation.

# 2005-01-19: Tightened the requirements for hash calculation
# a bit.

# Working directory. It is assigned before LOGFILE and the other
# relative file names below so that they are meant relative to it:
MAILDIR="$HOME/Mail/"
LOGFILE="log"

# The recipes have only been tested with this shell:
SHELL=/bin/bash

# Locale handed to the tr pipelines so that umlauts are treated
# as characters:
LANG=de_DE

# File holding the saved checksums. Sharing one file between
# multiple users is especially interesting. Entries should not be
# kept longer than a few weeks; a daily cron job with a tail
# command is enough to trim the file.
HASHFILE=mail-hashes

# Save locally generated hashes. Identically generated hashes from
# other sources (e.g. the iX spam filter) can be used as well.
SAVE_HASHES=YES

# Uncomment the following line to save recognized emails separately:
# KNOWNMAIL="knownmail"

# First checksum, built from the whitespace layout of the body.
# Requirements (checked against the body, flag B):
#   - at least one line break between two other characters (".$+.")
#   - at least 16 spaces/tabs: the score starts at -15 and every
#     space or tab adds 1; the recipe only fires on a positive total.
:0B
* .$+.
* -15^0
* 1^1 [	 ]
{
  # Feed the body (b) through the pipeline and wait for it (w):
  # squeeze runs of whitespace, delete every visible character,
  # hash what is left and strip the " -" that some md5sum
  # versions append.
  :0 bw
  md5hash=|tr -s '[:space:]' \
          |tr -d '[:graph:]' \
          |md5sum \
          |tr -d ' -'
  # 1st hash already generated from a previously received email?
  # grep -F replaces the obsolescent fgrep; -q keeps matched lines
  # off stdout and stops at the first hit, -s hides the error for a
  # HASHFILE that does not exist yet. $md5hash stays unquoted on
  # purpose: an empty value must vanish instead of becoming an
  # empty pattern that matches every line.
  :0 Aw
  * ? grep -F -q -s $md5hash $HASHFILE
  { KNOWN=YES }
}

# Second checksum, built from the punctuation of the body.
:0B
# Try another checksum only if there was no match so far:
* ! KNOWN ?? YES
# Minimum requirements this time: 3 of the following incidences
# within the mail body (score starts at -2, every hit adds 1):
* -2^0
* 1^1 ([<>()|@*'!?,]|:/)
{
  # Remove control characters (including line breaks and carriage
  # returns), letters, digits and "%&#;=" (often found in obfuscated
  # HTML code), convert underscores into dots, squeeze repeated
  # characters, then hash the rest and strip the " -" that some
  # md5sum versions append.
  :0 bw
  md5hash2=|tr -d '[:cntrl:][:alnum:]%&#;=' \
           |tr '_' '.' \
           |tr -s '[:print:]' \
           |md5sum \
           |tr -d ' -'
  # 2nd hash already generated from a previously received email?
  # grep -F replaces the obsolescent fgrep; -q keeps matched lines
  # off stdout and stops at the first hit, -s hides the error for a
  # HASHFILE that does not exist yet. $md5hash2 stays unquoted on
  # purpose: an empty value must vanish instead of becoming an
  # empty pattern that matches every line.
  :0 Aw
  * ? grep -F -q -s $md5hash2 $HASHFILE
  { KNOWN=YES }
  # Merge hashes: append the 2nd hash, separated by a tab, so that
  # both end up in md5hash.
  :0
  * md5hash2 ?? .
  { md5hash="$md5hash	$md5hash2" }
}

# Default checksum for emails without enough whitespace or
# punctuation structure, i.e. emails that still have no hash.
:0B
# Only if neither of the previous checksums was generated:
* ! md5hash ?? .
# Check only if some content exists (at least 8 characters in a row):
* ........
{
  # Remove control characters, whitespace and "=", squeeze repeated
  # visible characters, then hash the rest and strip the " -" that
  # some md5sum versions append.
  :0 bw
  md5hash=|tr -d '[:cntrl:][:space:]=' \
          |tr -s '[:graph:]' \
          |md5sum \
          |tr -d ' -'
  # Default hash already generated from a previously received email?
  # grep -F replaces the obsolescent fgrep; -q keeps matched lines
  # off stdout and stops at the first hit, -s hides the error for a
  # HASHFILE that does not exist yet. $md5hash stays unquoted on
  # purpose: an empty value must vanish instead of becoming an
  # empty pattern that matches every line.
  :0 Aw
  * ? grep -F -q -s $md5hash $HASHFILE
  { KNOWN=YES }
}

# Emails whose checksum was found in HASHFILE:
:0
* KNOWN ?? YES
{
  # Separate recognized emails if desired. KNOWNMAIL is a mailbox
  # file, so the delivery uses a local lockfile (second colon) to
  # keep concurrent deliveries from interleaving.
  :0:
  * KNOWNMAIL ?? ..
  $KNOWNMAIL
  # Otherwise mark the checksum match in the subject. The text after
  # "Subject:" is captured in MATCH; formail -i adds the new Subject
  # field and keeps the previous one renamed.
  :0 Efhw
  * ^Subject:\/.*
  | formail -i "Subject: [KNOWN]$MATCH"
  # No Subject field at all: create one.
  :0 Efhw
  | formail -I "Subject: [KNOWN]"
}
# Otherwise (E), save the unrecognized hashes if desired. The mail
# itself continues to the next recipe (c); echo never reads the mail
# from its stdin, hence write errors are ignored (i). The trailing
# colon requests a local lockfile for the append.
:0Eci:
* SAVE_HASHES ?? YES
# Nothing to save when no checksum could be generated; this keeps
# empty lines out of HASHFILE.
* md5hash ?? .
| echo $md5hash >> $HASHFILE

# Save any or only new emails here (only new ones if KNOWNMAIL is
# set, because recognized emails were already delivered above).
# "mail" is a mailbox file, so the delivery uses a local lockfile
# (second colon) to keep concurrent deliveries from interleaving.
:0:
mail
