--- /dev/null
+#!/bin/python
+"""
+InComparaNoid: Script with different analysis methods used to compare Ensembl and Inparanoid species projections.
+"""
+
+#----------------------------------------------------------------------------------------------------------------------
+# modules
+#----------------------------------------------------------------------------------------------------------------------
+import os
+import sys
+import argparse
+
+#----------------------------------------------------------------------------------------------------------------------
+# globals
+#----------------------------------------------------------------------------------------------------------------------
+ens_map = {}
+inp_map = {}
+
+#----------------------------------------------------------------------------------------------------------------------
+# classes
+#----------------------------------------------------------------------------------------------------------------------
+class switch(object):
+ def __init__(self, value):
+ self.value = value
+ self.fall = False
+
+ def __iter__(self):
+ """Return the match method once, then stop"""
+ yield self.match
+ raise StopIteration
+
+ def match(self, *args):
+ """Indicate whether or not to enter a case suite"""
+ if self.fall or not args:
+ return True
+ elif self.value in args:
+ self.fall = True
+ return True
+ else:
+ return False
+
+#----------------------------------------------------------------------------------------------------------------------
+# functions
+#----------------------------------------------------------------------------------------------------------------------
+
+#----------------------------------------------------------------------------------------------------------------------
+def create_ens_map(ortho_path, rap_map_path, recip_id) :
+#----------------------------------------------------------------------------------------------------------------------
+ """
+ open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where
+ reciprocal identity is >= param% and confidence is high
+ """
+ """
+ count_loc = 0
+ count_prj = 0
+ dict_ens_ids = {}
+ ENS = open(path + input_file)
+ ENS.readline();
+ for line in ENS :
+ cols = line.rstrip().split()
+ if len(cols) == 5 :
+ if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 : # reciprocal identity is >= recip_id%, high confidence
+ if cols[0] in dict_ens_ids :
+ dict_ens_ids[cols[0]].append(cols[1])
+ else :
+ dict_ens_ids[cols[0]] = [cols[1]]
+ ENS.close()
+
+ #for k, v in dict_ens_ids.iteritems() :
+ # print k, v
+
+ #OS_2_PRJ_MAP = open(path + output_file,'w')
+
+ RAP_IRGSP = open(path + "RAP-MSU.txt")
+ RAP_IRGSP.readline();
+ for line in RAP_IRGSP:
+ if line.strip() != "" :
+ cols = line.rstrip().split()
+ rap_id = cols[0].upper()
+ if rap_id in dict_ens_ids :
+ print cols[1].split(",")[0] + "\t" + ",".join(dict_ens_ids[rap_id])
+ #OS_2_PRJ_MAP.write(cols[1].upper() + "\t" + ",".join(set(dict_ens_ids[rap_id])) + "\n")
+ #count_prj += len(set(dict_ens_ids[rap_id]))
+ #count_loc += 1
+ RAP_IRGSP.close()
+ #OS_2_PRJ_MAP.close()
+
+ return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
+ """
+#----------------------------------------------------------------------------------------------------------------------
+def create_inp_map(ortho_path) :
+#----------------------------------------------------------------------------------------------------------------------
+ """
+ open the inparanoid file and generate a 2-col mapping of PRJ to LOC loci
+ """
+ """
+ count_loc = 0
+ count_prj = 0
+ dict_inp_ids = {}
+ INP = open(path + input_file)
+ INP.readline();
+ for line in INP :
+ cols = line.rstrip().split()
+ # ignore non-canonical orthologs
+ if int(cols[1].rsplit(".",1)[1]) > 1 :
+ continue
+ os_locus = cols[0].rstrip(".1")
+ prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes
+ if os_locus in dict_inp_ids :
+ dict_inp_ids[os_locus].append(prj_locus)
+ else :
+ dict_inp_ids[os_locus] = [prj_locus]
+ INP.close()
+
+ OS_2_PRJ_MAP = open(path + output_file,'w')
+ for k, v in sorted(dict_inp_ids.iteritems()) :
+ #print k, v
+ OS_2_PRJ_MAP.write(k + "\t" + ",".join(set(v)) + "\n")
+ count_prj += len(set(v))
+ count_loc += 1
+ OS_2_PRJ_MAP.close()
+
+ return [{"Unique Os loci:": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
+
+#----------------------------------------------------------------------------------------------------------------------
+def compare_maps(ensembl_map, inparanoid_map, output_path) :
+#----------------------------------------------------------------------------------------------------------------------
+ """compare Ensembl and Inparanoid projection results"""
+
+ inc_os = 0 # Common Os loci
+ exc_ens_os = 0 # Exclusive Ensembl Os loci
+ exc_inp_os = 0 # Exclusive Inparanoid Os loci
+ total_common_prj = 0 # Total Common projected loci
+ inc_common_prj = 0 # Common projected loci where present in both Ens and Inp
+ avg_prj_overlap = 0.0 # Avg. common projected loci
+ avg_inc_prj_overlap = 0.0 # Avg. common projected loci present in both Ens and Inp
+
+ ref_dict = {}
+ set_ens_os_loci = set()
+ set_inp_os_loci = set()
+
+ """
+ ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # common loci]}
+ """
+ # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level
+
+ ENS = open(path + input_file.split(',')[0])
+ for line in ENS :
+ cols = line.rstrip().split()
+ os_locus = cols[0]
+ prj_loci = cols[1].split(',')
+
+ # build ref_dict
+ if os_locus in ref_dict :
+ ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot
+ else :
+ ref_dict[os_locus] = [prj_loci, []]
+
+ # build list of os ens loci for set operations
+ set_ens_os_loci.add(os_locus)
+ ENS.close()
+
+ INP = open(path + input_file.split(',')[1])
+ for line in INP :
+ cols = line.rstrip().split()
+ os_locus = cols[0]
+ prj_loci = cols[1].split(',')
+
+ # continue build of ref_dict
+ if os_locus in ref_dict :
+ ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot
+ else :
+ ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot
+
+ # build list of os inp loci for set operations
+ set_inp_os_loci.add(os_locus)
+ INP.close()
+
+ #print set_ens_os_loci & set_inp_os_loci
+
+ # calculate common projected loci overlap sets
+ for k, v in ref_dict.iteritems() :
+ curr_comm_prj_loci = len(set(v[0]) & set(v[1]))
+ v.append(curr_comm_prj_loci)
+ total_common_prj += curr_comm_prj_loci
+ if curr_comm_prj_loci > 0 :
+ inc_common_prj += curr_comm_prj_loci
+
+ inc_os = len(set_ens_os_loci & set_inp_os_loci)
+ exc_ens_os = len(set_ens_os_loci - set_inp_os_loci)
+ exc_inp_os = len(set_inp_os_loci - set_ens_os_loci)
+
+ # calculate avg. common projected loci overlaps
+ avg_prj_overlap = float(total_common_prj) / float(len(ref_dict))
+ avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os)
+
+ #for k, v in ref_dict.iteritems() :
+ # print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2])
+
+ return [{"Common Os loci:": str(inc_os)},
+ {"Exclusive Ensembl Os loci:": str(exc_ens_os)},
+ {"Exclusive Inparanoid Os loci:": str(exc_inp_os)},
+ {"Total common projected loci:": str(total_common_prj)},
+ {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)},
+ {"Avg. common projected loci:": str(avg_prj_overlap)}]
+ """
+#----------------------------------------------------------------------------------------------------------------------
+# main
+#----------------------------------------------------------------------------------------------------------------------
+
+# process args
+parser = argparse.ArgumentParser(description='Script with different analysis methods used to compare Ensembl and Inparanoid species projections.')
+
+parser.add_argument('-e', '--ensembl_input_path', help='ensembl compara input file')
+parser.add_argument('-i', '--inparanoid_input_path', help='inparanoid supercluster input file')
+parser.add_argument('-m', '--rap_map_path', help='MSU-RAP mapping file')
+parser.add_argument('-r', '--reciprocal_id', type=int, help='reciprocal identity percentage')
+# TODO: add an "inparanoid super-cluster vs. conventional input" flag
+
+parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons')
+
+args = parser.parse_args()
+print args.accumulate(args.integers)
+
+# create projection maps
+ens_map = create_ens_map(args.ensembl_input_path, args.rap_map_path, args.reciprocal_id)
+inp_map = create_inp_map(args.inparanoid_input_path)
+
+# generate stats and output them
+compare_maps(ens_map, inp_map, comparison_file_path)
+
+++ /dev/null
-#!/bin/python
-"""
-Script with different analysis methods used to compare Ensembl and Inparanoid species projections.
-"""
-
-#----------------------------------------------------------------------------------------------------------------------
-# modules
-#----------------------------------------------------------------------------------------------------------------------
-import os
-import sys
-import argparse
-
-#----------------------------------------------------------------------------------------------------------------------
-# classes
-#----------------------------------------------------------------------------------------------------------------------
-class switch(object):
- def __init__(self, value):
- self.value = value
- self.fall = False
-
- def __iter__(self):
- """Return the match method once, then stop"""
- yield self.match
- raise StopIteration
-
- def match(self, *args):
- """Indicate whether or not to enter a case suite"""
- if self.fall or not args:
- return True
- elif self.value in args:
- self.fall = True
- return True
- else:
- return False
-
-#----------------------------------------------------------------------------------------------------------------------
-# functions
-#----------------------------------------------------------------------------------------------------------------------
-
-#----------------------------------------------------------------------------------------------------------------------
-def ens_map(input_file, output_file, recip_id) :
-#----------------------------------------------------------------------------------------------------------------------
- """
- open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where
- reciprocal identity is >= param% and confidence is high
- """
-
- count_loc = 0
- count_prj = 0
- dict_ens_ids = {}
- ENS = open(path + input_file)
- ENS.readline();
- for line in ENS :
- cols = line.rstrip().split()
- if len(cols) == 5 :
- if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 : # reciprocal identity is >= recip_id%, high confidence
- if cols[0] in dict_ens_ids :
- dict_ens_ids[cols[0]].append(cols[1])
- else :
- dict_ens_ids[cols[0]] = [cols[1]]
- ENS.close()
-
- #for k, v in dict_ens_ids.iteritems() :
- # print k, v
-
- #OS_2_PRJ_MAP = open(path + output_file,'w')
-
- RAP_IRGSP = open(path + "RAP-MSU.txt")
- RAP_IRGSP.readline();
- for line in RAP_IRGSP:
- if line.strip() != "" :
- cols = line.rstrip().split()
- rap_id = cols[0].upper()
- if rap_id in dict_ens_ids :
- print cols[1].split(",")[0] + "\t" + ",".join(dict_ens_ids[rap_id])
- #OS_2_PRJ_MAP.write(cols[1].upper() + "\t" + ",".join(set(dict_ens_ids[rap_id])) + "\n")
- #count_prj += len(set(dict_ens_ids[rap_id]))
- #count_loc += 1
- RAP_IRGSP.close()
- #OS_2_PRJ_MAP.close()
-
- return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
-
-#----------------------------------------------------------------------------------------------------------------------
-def inp_map(input_file, output_file) :
-#----------------------------------------------------------------------------------------------------------------------
- """
- open the inparanoid file and generate a 2-col mapping of PRJ to LOC loci
- """
-
- count_loc = 0
- count_prj = 0
- dict_inp_ids = {}
- INP = open(path + input_file)
- INP.readline();
- for line in INP :
- cols = line.rstrip().split()
- # ignore non-canonical orthologs
- if int(cols[1].rsplit(".",1)[1]) > 1 :
- continue
- os_locus = cols[0].rstrip(".1")
- prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes
- if os_locus in dict_inp_ids :
- dict_inp_ids[os_locus].append(prj_locus)
- else :
- dict_inp_ids[os_locus] = [prj_locus]
- INP.close()
-
- OS_2_PRJ_MAP = open(path + output_file,'w')
- for k, v in sorted(dict_inp_ids.iteritems()) :
- #print k, v
- OS_2_PRJ_MAP.write(k + "\t" + ",".join(set(v)) + "\n")
- count_prj += len(set(v))
- count_loc += 1
- OS_2_PRJ_MAP.close()
-
- return [{"Unique Os loci:": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
-
-#----------------------------------------------------------------------------------------------------------------------
-def compare_maps(input_file, output_file) :
-#----------------------------------------------------------------------------------------------------------------------
- """compare Ensembl and Inparanoid projection results"""
-
- inc_os = 0 # Common Os loci
- exc_ens_os = 0 # Exclusive Ensembl Os loci
- exc_inp_os = 0 # Exclusive Inparanoid Os loci
- total_common_prj = 0 # Total Common projected loci
- inc_common_prj = 0 # Common projected loci where present in both Ens and Inp
- avg_prj_overlap = 0.0 # Avg. common projected loci
- avg_inc_prj_overlap = 0.0 # Avg. common projected loci present in both Ens and Inp
-
- ref_dict = {}
- set_ens_os_loci = set()
- set_inp_os_loci = set()
-
- """
- ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # common loci]}
- """
- # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level
-
- ENS = open(path + input_file.split(',')[0])
- for line in ENS :
- cols = line.rstrip().split()
- os_locus = cols[0]
- prj_loci = cols[1].split(',')
-
- # build ref_dict
- if os_locus in ref_dict :
- ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot
- else :
- ref_dict[os_locus] = [prj_loci, []]
-
- # build list of os ens loci for set operations
- set_ens_os_loci.add(os_locus)
- ENS.close()
-
- INP = open(path + input_file.split(',')[1])
- for line in INP :
- cols = line.rstrip().split()
- os_locus = cols[0]
- prj_loci = cols[1].split(',')
-
- # continue build of ref_dict
- if os_locus in ref_dict :
- ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot
- else :
- ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot
-
- # build list of os inp loci for set operations
- set_inp_os_loci.add(os_locus)
- INP.close()
-
- #print set_ens_os_loci & set_inp_os_loci
-
- # calculate common projected loci overlap sets
- for k, v in ref_dict.iteritems() :
- curr_comm_prj_loci = len(set(v[0]) & set(v[1]))
- v.append(curr_comm_prj_loci)
- total_common_prj += curr_comm_prj_loci
- if curr_comm_prj_loci > 0 :
- inc_common_prj += curr_comm_prj_loci
-
- inc_os = len(set_ens_os_loci & set_inp_os_loci)
- exc_ens_os = len(set_ens_os_loci - set_inp_os_loci)
- exc_inp_os = len(set_inp_os_loci - set_ens_os_loci)
-
- # calculate avg. common projected loci overlaps
- avg_prj_overlap = float(total_common_prj) / float(len(ref_dict))
- avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os)
-
- #for k, v in ref_dict.iteritems() :
- # print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2])
-
- return [{"Common Os loci:": str(inc_os)},
- {"Exclusive Ensembl Os loci:": str(exc_ens_os)},
- {"Exclusive Inparanoid Os loci:": str(exc_inp_os)},
- {"Total common projected loci:": str(total_common_prj)},
- {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)},
- {"Avg. common projected loci:": str(avg_prj_overlap)}]
-
-#----------------------------------------------------------------------------------------------------------------------
-# main
-#----------------------------------------------------------------------------------------------------------------------
-
-# process args
-parser = argparse.ArgumentParser(description='Script with different analysis methods used to compare Ensembl and Inparanoid species projections.')
-
-parser.add_argument('-r', '--reciprocal_id', type=int, help='reciprocal identity percentage')
-parser.add_argument('-e', '--ensembl_input', help='ensmbl input file')
-parser.add_argument('-i', '--inparanoid_input', help='inparanoid input file')
-parser.add_argument('-o', '--output', help='destination output file')
-
-args = parser.parse_args()
-print args.accumulate(args.integers)
-
-input_file =
-output_file =
-map_source_type =
-recip_id = sys.argv[4]
-
-
-list_counts = []
-
-for case in switch(map_source_type):
- if case("ens"):
- list_counts = ens_map(input_file, output_file, recip_id)
- break
- if case("inp"):
- list_counts = inp_map(input_file, output_file)
- break
- if case("compare"):
- list_counts = compare_maps(input_file, output_file) # input_file is actually two comma-del input files (ens 1st, inp 2nd)
- break
- if case(): # default, could also just omit condition or 'if True'
- print "Must provide a valid map source type: 'ens' (Ensembl), 'inp' (Inparanoid), or 'compare'"
-
-for item in list_counts :
- for k, v in item.iteritems():
- print k, v
-