#!/usr/bin/env python
"""converts matches from bwsearch to filenames via a crtom.starts file.

the bwsearch program reports the positions of a match (with
parameter -r). this information can be converted into more
usefulness with a small index file 'ctrom.starts' which contains
starting positions of the original files in the big concatenated file.
in the 'real world' the search program would report the files itself.

"""

__author__ = 'towi'
__date__   = '2004-12 (initial)'

def readStarts(fnStarts):
    """Read the start-offset index file into two parallel lists.

    The file is expected to be sorted by offset and to look like this:
    28:> /var/tmp/towi/ctrom/04/01/003/ART.HTM
    3335:> /var/tmp/towi/ctrom/04/01/006/ART.HTM
    15667:> /var/tmp/towi/ctrom/04/01/010/ART.HTM
    38336:> /var/tmp/towi/ctrom/04/01/012/ART.HTM

    Each line is split at the first ':' only, so filenames may themselves
    contain colons.  Everything after that colon is kept, stripped of
    surrounding whitespace (the leading '> ' marker is therefore part of
    the returned name).  Blank lines are ignored.

    Parameters:
        fnStarts -- path of the starts file.

    Returns:
        (nums, lines) where nums[i] is the integer start offset and
        lines[i] the corresponding name, in file order.

    Raises:
        IOError/OSError if the file cannot be opened; ValueError if a
        non-blank line has no ':' or a non-integer offset.
    """
    nums = []
    lines = []
    # 'with' guarantees the handle is closed even if parsing fails.
    with open(fnStarts) as fh:
        for raw in fh:
            if not raw.strip():
                # tolerate empty lines (e.g. a trailing newline at EOF)
                continue
            offset, name = raw.split(':', 1)
            nums.append(int(offset))
            lines.append(name.strip())
    # nums  = [num1, num2, num3, ...] in the order found in the file
    # lines = [name1, name2, ...] at the same positions as in nums
    return nums, lines

def readMatches(fnMatches):
    """Read the match positions from a bwsearch report.

    The input file looks like this:
    Found 3 occ in 0.00 seconds!
    Occ.    1 is at position  9636059
    Occ.    2 is at position 13280605
    Occ.    3 is at position  3652196
    Located 3 occ's in 0.02 seconds!

    Only lines starting with 'Occ.' are considered; the position is the
    sixth whitespace-separated token of such a line.

    Parameters:
        fnMatches -- path of the file holding the bwsearch output.

    Returns:
        The positions as a list of ints, sorted ascending.

    Raises:
        IOError/OSError if the file cannot be opened; IndexError or
        ValueError if an 'Occ.' line does not have the expected shape.
    """
    positions = []
    # 'with' guarantees the handle is closed even if parsing fails.
    with open(fnMatches) as fh:
        for line in fh:
            if line.startswith('Occ.'):
                # tokens: Occ. <n> is at position <pos>
                positions.append(int(line.split()[5]))
    positions.sort()
    return positions
    
    
def main(fnStarts, fnMatches):
    """Print, for every match position, the file that contains it.

    Parameters:
        fnStarts  -- path of the starts index file (see readStarts).
        fnMatches -- path of the bwsearch output file (see readMatches).

    One line of the form '<position> is in <name>' is printed per match,
    in ascending order of position.  A position that lies before the
    first start offset (or any position when the index is empty) belongs
    to no file and is reported with a placeholder instead of a name.
    """
    import bisect
    nums, lines = readStarts(fnStarts)
    positions = readMatches(fnMatches)
    for pos in positions:
        # bisect() counts the start offsets <= pos, so the containing
        # file is the one at index n-1.
        n = bisect.bisect(nums, pos)
        if n == 0:
            # No file starts at or before pos; lines[n-1] would silently
            # wrap around to the LAST file (or fail on an empty index).
            name = '<before first file>'
        else:
            name = lines[n - 1]
        # Parenthesised single argument: same output on Python 2 and 3.
        print("%10s is in %s" % (pos, name))
    

# Command-line entry point: expects exactly two arguments, the starts
# index file and the file holding the bwsearch output.
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        # Usage errors go to stderr with a non-zero exit status so that
        # calling scripts can detect them.  sys.argv[0] is printed as-is:
        # it already carries whatever path the script was invoked with.
        sys.stderr.write("Usage: %s startfile matchesfile\n" % sys.argv[0])
        sys.exit(2)
    main(fnStarts=sys.argv[1], fnMatches=sys.argv[2])
        
