#!/bin/python
+"""
+Script with different analysis methods used to compare Ensembl and Inparanoid species projections.
+"""
-# open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of AT to LOC loci where reciprocal identity is >= 50% and confidence is high
-
-path = "/home/preecej/Documents/projects/plant_reactome/plant_reactome_site/projection/rice_to_arabidopsis/"
-
-dict_ens_ids = {}
-
-ENS = open(path + "ensembl_plants_40_os_2_at_uniq.tab")
-ENS.readline();
-for line in ENS :
- cols = line.rstrip().split()
- if len(cols) == 5 :
- if int(cols[2]) >= 50 and int(cols[3]) >= 50 and int(cols[4]) == 1 : # reciprocal identity is >= 50%, high confidence
- if cols[0] in dict_ens_ids.keys() :
- dict_ens_ids[cols[0]].append(cols[1])
- else :
- dict_ens_ids[cols[0]] = [cols[1]]
-ENS.close()
-
-#for k, v in dict_ens_ids.iteritems() :
-# print k, v
-
-OS_2_AT_MAP = open(path + "ensembl_ortho_os_2_at.tab",'w')
-
-RAP_IRGSP = open(path + "loc_rap_mappings.txt")
-RAP_IRGSP.readline();
-for line in RAP_IRGSP:
- if line.strip() != "" :
+#----------------------------------------------------------------------------------------------------------------------
+# modules
+#----------------------------------------------------------------------------------------------------------------------
+import os
+import sys
+
+#----------------------------------------------------------------------------------------------------------------------
+# globals
+#----------------------------------------------------------------------------------------------------------------------
+path = os.getcwd() + "/"
+input_file = sys.argv[1]
+output_file = sys.argv[2]
+map_source_type = sys.argv[3]
+
+#----------------------------------------------------------------------------------------------------------------------
+# classes
+#----------------------------------------------------------------------------------------------------------------------
+class switch(object):
+ def __init__(self, value):
+ self.value = value
+ self.fall = False
+
+ def __iter__(self):
+ """Return the match method once, then stop"""
+ yield self.match
+ raise StopIteration
+
+ def match(self, *args):
+ """Indicate whether or not to enter a case suite"""
+ if self.fall or not args:
+ return True
+ elif self.value in args:
+ self.fall = True
+ return True
+ else:
+ return False
+
+#----------------------------------------------------------------------------------------------------------------------
+# functions
+#----------------------------------------------------------------------------------------------------------------------
+
+#----------------------------------------------------------------------------------------------------------------------
+def ens_map(input_file, output_file) :
+#----------------------------------------------------------------------------------------------------------------------
+ """
+ open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where
+ reciprocal identity is >= 50% and confidence is high
+ """
+
+ count_loc = 0
+ count_prj = 0
+ dict_ens_ids = {}
+ ENS = open(path + input_file)
+ ENS.readline();
+ for line in ENS :
+ cols = line.rstrip().split()
+ if len(cols) == 5 :
+ if int(cols[2]) >= 50 and int(cols[3]) >= 50 and int(cols[4]) == 1 : # reciprocal identity is >= 50%, high confidence
+ if cols[0] in dict_ens_ids.keys() :
+ dict_ens_ids[cols[0]].append(cols[1])
+ else :
+ dict_ens_ids[cols[0]] = [cols[1]]
+ ENS.close()
+
+ #for k, v in dict_ens_ids.iteritems() :
+ # print k, v
+
+ OS_2_PRJ_MAP = open(path + output_file,'w')
+
+ RAP_IRGSP = open(path + "loc_rap_mappings_uniq.txt")
+ RAP_IRGSP.readline();
+ for line in RAP_IRGSP:
+ if line.strip() != "" :
+ cols = line.rstrip().split()
+ rap_id = cols[1].upper()
+ if rap_id in dict_ens_ids and rap_id != "NONE" :
+ #print cols[0] + "\t" + ",".join(dict_ens_ids[rap_id])
+ OS_2_PRJ_MAP.write(cols[0].upper() + "\t" + ",".join(dict_ens_ids[rap_id]) + "\n")
+ count_prj += len(dict_ens_ids[rap_id])
+ count_loc += 1
+ RAP_IRGSP.close()
+ OS_2_PRJ_MAP.close()
+
+ return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
+
+#----------------------------------------------------------------------------------------------------------------------
+def inp_map(input_file, output_file) :
+#----------------------------------------------------------------------------------------------------------------------
+ """
+ open the inparanoid file and generate a 2-col mapping of PRJ to LOC loci
+ """
+
+ count_loc = 0
+ count_prj = 0
+ dict_inp_ids = {}
+ INP = open(path + input_file)
+ INP.readline();
+ for line in INP :
+ cols = line.rstrip().split()
+ os_locus = cols[1].rstrip(".1")
+ prj_locus = cols[2].rstrip(".1")
+ if os_locus in dict_inp_ids.keys() :
+ dict_inp_ids[os_locus].append(prj_locus)
+ else :
+ dict_inp_ids[os_locus] = [prj_locus]
+ INP.close()
+
+ OS_2_PRJ_MAP = open(path + output_file,'w')
+ for k, v in sorted(dict_inp_ids.iteritems()) :
+ #print k, v
+ OS_2_PRJ_MAP.write(k + "\t" + ",".join(v) + "\n")
+ count_prj += len(v)
+ count_loc += 1
+ OS_2_PRJ_MAP.close()
+
+ return [{"Unique Os loci:": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
+
+#----------------------------------------------------------------------------------------------------------------------
+def compare_maps(input_file, output_file) :
+#----------------------------------------------------------------------------------------------------------------------
+ """compare Ensembl and Inparanoid projection results"""
+
+ inc_os = 0 # Common Os loci
+ exc_ens_os = 0 # Exclusive Ensembl Os loci
+ exc_inp_os = 0 # Exclusive Inparanoid Os loci
+ total_common_prj = 0 # Total Common projected loci
+ inc_common_prj = 0 # Common projected loci where present in both Ens and Inp
+ avg_prj_overlap = 0.0 # Avg. common projected loci
+ avg_inc_prj_overlap = 0.0 # Avg. common projected loci present in both Ens and Inp
+
+ ref_dict = {}
+ set_ens_os_loci = set()
+ set_inp_os_loci = set()
+
+ """
+ ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # common loci]}
+ """
+ # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level
+
+ ENS = open(path + input_file.split(',')[0])
+ for line in ENS :
cols = line.rstrip().split()
- rap_id = cols[1].upper()
- if rap_id in dict_ens_ids and rap_id != "NONE" :
- #print cols[0] + "\t" + ",".join(dict_ens_ids[rap_id])
- OS_2_AT_MAP.write(cols[0] + "\t" + ",".join(dict_ens_ids[rap_id]) + "\n")
-RAP_IRGSP.close()
+ os_locus = cols[0]
+ prj_loci = cols[1].split(',')
+
+ # build ref_dict
+ if os_locus in ref_dict.keys() :
+ ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot
+ else :
+ ref_dict[os_locus] = [prj_loci, []]
+
+ # build list of os ens loci for set operations
+ set_ens_os_loci.add(os_locus)
+ ENS.close()
+
+ INP = open(path + input_file.split(',')[1])
+ for line in INP :
+ cols = line.rstrip().split()
+ os_locus = cols[0]
+ prj_loci = cols[1].split(',')
+
+ # continue build of ref_dict
+ if os_locus in ref_dict.keys() :
+ ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot
+ else :
+ ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot
+
+ # build list of os inp loci for set operations
+ set_inp_os_loci.add(os_locus)
+ INP.close()
+
+ #print set_ens_os_loci & set_inp_os_loci
+
+ # calculate common projected loci overlap sets
+ for k, v in ref_dict.iteritems() :
+ curr_comm_prj_loci = len(set(v[0]) & set(v[1]))
+ v.append(curr_comm_prj_loci)
+ total_common_prj += curr_comm_prj_loci
+ if curr_comm_prj_loci > 0 :
+ inc_common_prj += curr_comm_prj_loci
+
+ inc_os = len(set_ens_os_loci & set_inp_os_loci)
+ exc_ens_os = len(set_ens_os_loci - set_inp_os_loci)
+ exc_inp_os = len(set_inp_os_loci - set_ens_os_loci)
+
+ # calculate avg. common projected loci overlaps
+ avg_prj_overlap = float(total_common_prj) / float(len(ref_dict))
+ avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os)
+
+ #for k, v in ref_dict.iteritems() :
+ # print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2])
+
+ return [{"Common Os loci:": str(inc_os)},
+ {"Exclusive Ensembl Os loci:": str(exc_ens_os)},
+ {"Exclusive Inparanoid Os loci:": str(exc_inp_os)},
+ {"Total common projected loci:": str(total_common_prj)},
+ {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)},
+ {"Avg. common projected loci:": str(avg_prj_overlap)}]
+
+#----------------------------------------------------------------------------------------------------------------------
+# main
+#----------------------------------------------------------------------------------------------------------------------
+list_counts = []
+
+for case in switch(map_source_type):
+ if case("ens"):
+ list_counts = ens_map(input_file, output_file)
+ break
+ if case("inp"):
+ list_counts = inp_map(input_file, output_file)
+ break
+ if case("compare"):
+ list_counts = compare_maps(input_file, output_file) # input_file is actually two comma-del input files (ens 1st, inp 2nd)
+ break
+ if case(): # default, could also just omit condition or 'if True'
+ print "Must provide a valid map source type: 'ens' (Ensembl), 'inp' (Inparanoid), or 'compare'"
+
+for item in list_counts :
+ for k, v in item.iteritems():
+ print k, v
-OS_2_AT_MAP.close()