#!/usr/bin/ruby

require 'csv'
require 'optparse'

$options = {
	:verbose => true,
	:fatal => false,
	:output => "output.csv",
	:output2 => "output2.csv"
}
OptionParser.new do |opts|
	opts.banner = "Usage: #{$0} [options] raw.csv corr.csv"

	opts.on("-o", "--output=FILE", "output to FILE") do |file|
		$options[:output] = file
	end
	opts.on("-f", "--fatal", "Stop on first error") do |v|
		$options[:fatal] = v
	end
	opts.on("-q", "--quiet", "Run quietly") do |v|
		$options[:verbose] = false
	end
end.parse!

def log(msg)
	if $options[:verbose]
		puts msg
	end
end

def err(msg)
	puts msg

	if $options[:fatal]
		exit
	end
end

# ncbi -> description
protein_description = {}
# peptide name -> ncbi
peptide_assignment = {}
# peptide index (eg. 12) -> peptide name (eg. "JKEUF")
peptide_name = {}

# 2d matrix, first row and column are the peptide indexes, eg. 12, where 
# peptide_index[12] == "AAPFSLEYR", element (0, 0) is nil
matrix = []

log "loading matrix .."
lineno = 0
CSV.open(ARGV[0], 'r') do |row|
    lineno += 1

    # the first line of the file with the column headers won't match
    if not row[0] =~ /gi\|(\d+)\|(ref|sp)\|([A-Z].*\d+)\|/
        # the first line from col 4 onwards lists the peps in order ... use this
        # to give each pep a unique peptide_index
        peptide_index = 2
        row[4 .. -1].each do |peptide|
            peptide_name[peptide_index] = peptide
            peptide_index += 1
        end

        matrix << [nil] + (2 ... peptide_index).to_a
    else
        accession = $~[1]
        ncbi = $~[3]
        description = row[1]
        peptide = row[3]

        # the desc often has the gi thing at the start
        if description =~ /gi\|(\d+)\|(ref|sp)\|([A-Z].*\d+)\| (.*)/
            description = $~[4]
        end
 
        # puts "peptide = #{peptide}"
        # puts "ncbi = #{ncbi}"
        # puts "description = #{description}"

        protein_description[ncbi] = description
        peptide_assignment[peptide] = ncbi

        # check rows and cols match
        if peptide != peptide_name[lineno] 
            err "line #{lineno} peps don't match"
        end

        # get the line from the pep to the end
        line = [lineno]
        row[4 .. -1].each {|x| line << x.to_f}
        matrix << line
    end
end

log "loaded #{matrix.length} peptides"

# this many or more significant peps in common
# cluster_threshold = 3
# r_cutoff = 0.81

cluster_threshold = 5
r_cutoff = 0.81

# find all peps with a significant correlation with another pep (remember every
# pep will have a 1 with itself)
significant = {}
(1 ... matrix.length).each do |i|
    bits = matrix[i][1 .. -1].map {|x| x ** 2 > r_cutoff ? 1 : 0}

    sum = bits.reduce(:+)
    if sum > cluster_threshold
        significant[i] = bits
    end
end

log "#{significant.length} peps have #{cluster_threshold} or more significant relations"

def bit_and(a, b)
    result = []
    a.zip(b) {|a, b| result <<= a & b}
    result
end

# make a set of singleton clusters, then start joining them up
clusters = {}
cluster_name = 0
significant.each do |i, bits|
    # a pair: the set of bits that the peps in this cluster have in common,
    # and the members of the cluster
    clusters[cluster_name] = [bits, [i]]
    cluster_name += 1
end

# make a distance map ... the distance between every pair of cluster names
distance = Hash.new {|h, k| h[k] = Hash.new}

log "building distance map ..."
clusters.each do |name1, cluster1|
    clusters.each do |name2, cluster2|
        next if name1 == name2

        if distance.has_key?(name1) and
            distance[name1].has_key?(name2) 
            next
        end

        sum = bit_and(cluster1[0], cluster2[0]).reduce(:+)

        distance[name1][name2] = sum
        distance[name2][name1] = sum
    end
end

def write_matrix(peptide_assignment, protein_description, peptide_name, 
    matrix, clusters, filename)
    keys = clusters.keys.sort do |a, b| 
        clusters[b][1].length <=> clusters[a][1].length
    end

    # unpack clusters back into a matrix
    cluster_matrix = [matrix[0]]
    keys.each do |name|
        bits, peps = clusters[name]
        peps.each do |i|
            cluster_matrix << matrix[i]
        end
    end

    # we want to order columns to match our row order
    cluster_matrix = cluster_matrix.transpose
    cluster_matrix = cluster_matrix.sort do |a, b|
        i = cluster_matrix[0].index(a[0])
        j = cluster_matrix[0].index(b[0])

        if i == nil 
            1
        elsif j == nil 
            -1
        else
            i <=> j
        end
    end
    cluster_matrix = cluster_matrix.transpose

    CSV.open(filename, 'w') do |write|
        header_index = cluster_matrix[0][1 ... cluster_matrix.length]
        write << ["NCBI", "Description", "Peptide"] + 
            header_index.map {|x| peptide_name[x]}

        cluster_matrix[1 .. -1].each do |row|
            peptide_index = row[0]
            peptide = peptide_name[peptide_index]
            ncbi = peptide_assignment[peptide]
            description = protein_description[ncbi]
            # write << row
            write << [ncbi, description, peptide] + 
                row[1 .. cluster_matrix.length]
        end
    end
end

# repeatedly loop, joining the closest pair of elements each time

# start the animation at frame 1000 so that we have enough leading zeros to make
# alphabetic sort the same as numerical sort
frame = 1000

while true
    # write the state out for animation
    # comment this out if you're not interested in that
    write_matrix(peptide_assignment, protein_description, peptide_name, 
        matrix, clusters, "frame#{frame}.csv")
    frame += 1

    log "searching #{clusters.length} clusters ..."
    best = 0
    best_name1 = 0
    best_name2 = 0
    clusters.each do |name1, cluster1|
        clusters.each do |name2, cluster2|
            next if name1 == name2

            if distance[name1][name2] > best
                best = distance[name1][name2]
                best_name1 = name1
                best_name2 = name2
            end
        end
    end

    break if best < cluster_threshold

    log "joining clusters #{best_name1} and #{best_name2}, #{best} peps in common ..."

    bits = bit_and(clusters[best_name1][0], clusters[best_name2][0])
    peps = clusters[best_name1][1] + clusters[best_name2][1]
    name = cluster_name
    clusters[name] = [bits, peps]
    cluster_name += 1

    clusters.delete(best_name1)
    clusters.delete(best_name2)

    # and update the distance map

    distance.delete(best_name1)
    distance.delete(best_name2)

    clusters.each do |name2, cluster2|
        bits2, peps2 = cluster2

        sum = bit_and(bits, bits2).reduce(:+)

        distance[name][name2] = sum
        distance[name2][name] = sum
    end
end 

# remove all singletons
clusters.delete_if {|name, cluster| cluster[1].length < 2}

keys = clusters.keys.sort do |a, b| 
    clusters[b][1].length <=> clusters[a][1].length
end

log "#{keys.length} peptide clusters found"

log "writing #{$options[:output]} ..."
CSV.open($options[:output], 'w') do |write|
    keys.each do |name|
        bits, peps = clusters[name]

        row = []
        row << "cluster of #{peps.length} related peptides"
        write << row

        peps.each do |i|
            peptide_index = matrix[i][0]
            peptide = peptide_name[peptide_index]
            ncbi = peptide_assignment[peptide]
            description = protein_description[ncbi]

            row = []
            row << i
            row << peptide
            row << ncbi
            row << description
            write << row
        end

        row = []
        write << row
    end
end

log "writing #{$options[:output2]} ..."
write_matrix(peptide_assignment, protein_description, peptide_name, 
    matrix, clusters, $options[:output2])

