dict_ens_map = {}
dict_inp_map = {}
-#----------------------------------------------------------------------------------------------------------------------
-# classes
-#----------------------------------------------------------------------------------------------------------------------
-class switch(object):
- def __init__(self, value):
- self.value = value
- self.fall = False
-
- def __iter__(self):
- """Return the match method once, then stop"""
- yield self.match
- raise StopIteration
-
- def match(self, *args):
- """Indicate whether or not to enter a case suite"""
- if self.fall or not args:
- return True
- elif self.value in args:
- self.fall = True
- return True
- else:
- return False
-
#----------------------------------------------------------------------------------------------------------------------
# functions
#----------------------------------------------------------------------------------------------------------------------
dict_uniprot_map[loc] = uniprot
UNI.close()
- #for k, v in sorted(dict_uniprot_map.iteritems()) :
- #print k, v
-
return dict_uniprot_map
+
#----------------------------------------------------------------------------------------------------------------------
def create_inp_map(inparanoid_input_path, dict_uniprot_map) :
#----------------------------------------------------------------------------------------------------------------------
open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci
"""
dict_inp_map = {} # local ensembl orthology dict
- count_loci = 0
- count_projections = 0
INP = open(inparanoid_input_path)
for line in INP :
else :
dict_inp_map[os_locus] = set([prj_locus])
INP.close()
-
- for k, v in sorted(dict_inp_map.iteritems()) :
- #print k, v
- count_projections += len(set(v))
- count_loci += 1
-
- list_stats.extend([{"[Inparanoid Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}])
return dict_inp_map
"""
dict_ens_map = {} # local ensembl orthology dict
dict_rap_map = {} # local MSU-RAP dict, w/ only filtered canonical LOC loci (orig. ".1")
- count_loci = 0
- count_projections = 0
# generate internal MSU-RAP map
RAP_MAP = open(rap_map_path)
break;
RAP_MAP.close()
- #for k, v in dict_rap_map.iteritems() :
- # print k, v
-
# generate ref loci filter
FILTER = open(filtering_loci_path)
loci_filter = set()
else :
dict_ens_map[os_locus] = set([cols[1]])
ENS.close()
-
- for k, v in dict_ens_map.iteritems() :
- #print k, v
- count_loci += 1
- count_projections += len(v)
-
- list_stats.extend([{"\n[Ensemble Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}])
return dict_ens_map
#----------------------------------------------------------------------------------------------------------------------
-def compare_maps(dict_ens_map, dict_inp_map, comparison_file_path, ensembl_output_path, inparanoid_output_path) :
+def compare_maps(dict_cmp_map, dict_inp_map, comparison_file_path, compara_output_path, inparanoid_output_path) :
#----------------------------------------------------------------------------------------------------------------------
"""compare Ensembl and Inparanoid projection results"""
- inc_os = 0 # Common Os loci
- exc_ens_os = 0 # Exclusive Ensembl Os loci
- exc_inp_os = 0 # Exclusive Inparanoid Os loci
- total_common_prj = 0 # Total Common projected loci
- inc_common_prj = 0 # Common projected loci where present in both Ens and Inp
- avg_prj_overlap = 0.0 # Avg. common projected loci
- avg_inc_prj_overlap = 0.0 # Avg. common projected loci present in both Ens and Inp
+
+ # reference loci
+ inp_ref_loci = len(dict_inp_map.keys()) # inparanoid ref loci
+ cmp_ref_loci = len(dict_cmp_map.keys()) # compara ref loci
- ref_dict = {}
- set_ens_os_loci = set()
- set_inp_os_loci = set()
+ set_inp_ref_loci = set(dict_inp_map.keys())
+ set_cmp_ref_loci = set(dict_cmp_map.keys())
- #ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # of common loci]}
+ inp_exc_ref_loci = len(set_inp_ref_loci - set_cmp_ref_loci) # inparanoid exclusive ref loci
+ cmp_exc_ref_loci = len(set_cmp_ref_loci - set_inp_ref_loci) # compara exclusive ref loci
+ union_ref_loci = len(set_inp_ref_loci | set_cmp_ref_loci) # all ref loci
+ intersection_ref_loci = len(set_inp_ref_loci & set_cmp_ref_loci) # all shared ref loci
- # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level
+ inp_ref_loci_coverage = '{:.2%}'.format(inp_ref_loci/float(union_ref_loci))
+ cmp_ref_loci_coverage = '{:.2%}'.format(cmp_ref_loci/float(union_ref_loci))
- for os_locus, prj_loci in dict_ens_map.iteritems() :
+ """
+ projected loci (all non-unique, meaning recounted relative to each ref locus; in other words,
+ not a collapsed set of projections that would ignore 1-to-many ref loci projections
+ """
+ inp_prj_loci = sum([len(v) for v in dict_inp_map.itervalues()]) # total inparanoid projected loci
+ cmp_prj_loci = sum([len(v) for v in dict_cmp_map.itervalues()]) # total compara projected loci
- # build ref_dict
- if os_locus in ref_dict :
- ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot
+ # iterate over both map files and build an overlap map, for the purpose of counting inclusive and exclusive projections by locus
+ # ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # of common loci]}
+ ref_dict = {}
+
+ # build ref_dict (Compara)
+ for ref_locus, prj_loci in dict_cmp_map.iteritems() :
+ if ref_locus in ref_dict :
+ ref_dict[ref_locus][0].extend(prj_loci) # add Compara projected loci in the first list slot
else :
- ref_dict[os_locus] = [prj_loci, []]
-
- # build list of os ens loci for set operations
- set_ens_os_loci.add(os_locus)
+ ref_dict[ref_locus] = [prj_loci, []]
- for os_locus, prj_loci in dict_inp_map.iteritems() :
-
- # continue build of ref_dict
- if os_locus in ref_dict :
- ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot
+ # continue build of ref_dict (Inparanoid)
+ for ref_locus, prj_loci in dict_inp_map.iteritems() :
+ if ref_locus in ref_dict :
+ ref_dict[ref_locus][1].extend(prj_loci) # put the Inparanoid projected loci in the second list slot
else :
- ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot
-
- # build list of os inp loci for set operations
- set_inp_os_loci.add(os_locus)
+ ref_dict[ref_locus] = [[], prj_loci] # create an empty slot for the Compara projected loci, put the Inparanoid projected loci in the second list slot
- #print set_ens_os_loci & set_inp_os_loci
+ inp_exc_prj_loci = 0 # inparanoid exclusive projected loci
+ cmp_exc_prj_loci = 0 # compara exclusive projected loci
+ union_prj_loci = 0 # all projections, duplicates removed only at the ref locus level
+ intersection_prj_loci = 0 # all overlapping projections
# calculate common projected loci overlap sets
for k, v in ref_dict.iteritems() :
- curr_comm_prj_loci = len(set(v[0]) & set(v[1]))
- v.append(curr_comm_prj_loci)
- total_common_prj += curr_comm_prj_loci
- if curr_comm_prj_loci > 0 :
- inc_common_prj += curr_comm_prj_loci
+ curr_intersection_prj_loci = len(set(v[0]) & set(v[1]))
+ v.append(curr_intersection_prj_loci)
+ intersection_prj_loci += curr_intersection_prj_loci
+
+ union_prj_loci += len(set(v[0]) | set(v[1]))
+ inp_exc_prj_loci += len(set(v[1]) - set(v[0]))
+ cmp_exc_prj_loci += len(set(v[0]) - set(v[1]))
- inc_os = len(set_ens_os_loci & set_inp_os_loci)
- exc_ens_os = len(set_ens_os_loci - set_inp_os_loci)
- exc_inp_os = len(set_inp_os_loci - set_ens_os_loci)
+ inp_prj_loci_coverage = '{:.2%}'.format(inp_prj_loci/float(union_prj_loci))
+ cmp_prj_loci_coverage = '{:.2%}'.format(cmp_prj_loci/float(union_prj_loci))
+
+ # other stats
+ #avg_projections_per_locus
+ #avg_overlapping_projections_per_locus
# calculate avg. common projected loci overlaps
- avg_prj_overlap = float(total_common_prj) / float(len(ref_dict))
- avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os)
-
- #for k, v in ref_dict.iteritems() :
- # print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2])
-
- list_stats.extend([{"\n[Comparison Stats]": ""},
- {"Common Os loci:": str(inc_os)},
- {"Exclusive Ensembl Os loci:": str(exc_ens_os)},
- {"Exclusive Inparanoid Os loci:": str(exc_inp_os)},
- {"Total Os loci:": str(inc_os + exc_ens_os + exc_inp_os)},
- {"Percentage Reference Loci Coverage (Ensemble):": str((float(inc_os + exc_ens_os)/float(inc_os + exc_ens_os + exc_inp_os)*100))},
- {"":""},
- {"Common projected loci:": str(total_common_prj)},
- {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)},
- {"Avg. common projected loci:": str(avg_prj_overlap)}])
-
- COMP_STATS = open(comparison_file_path, 'w')
+ #avg_prj_overlap = float(total_intersection_projections) / float(len(ref_dict))
+ #avg_inc_prj_overlap = float(total_union_projections) / float(inc_os)
+
+ list_stats.extend([{"[Comparison Stats]": ""},
+
+ {"\n-- Reference Loci --":""},
+ {"Inparanoid reference loci:": str(inp_ref_loci)},
+ {"Compara reference loci:": str(cmp_ref_loci)},
+ {"Exclusive Inparanoid reference loci:": str(inp_exc_ref_loci)},
+ {"Exclusive Compara reference loci:": str(cmp_exc_ref_loci)},
+ {"All reference loci (union):": str(union_ref_loci)},
+ {"All shared reference loci (intersection):": str(intersection_ref_loci)},
+ {"Inparanoid reference loci coverage:": inp_ref_loci_coverage},
+ {"Compara reference loci coverage:": cmp_ref_loci_coverage},
+
+ {"\n-- Projected Loci (all non-unique with respect to reference loci) --":""},
+ {"Inparanoid projected loci:": str(inp_prj_loci)},
+ {"Compara projected loci:": str(cmp_prj_loci)},
+ {"Inparanoid exclusive projected loci:": str(inp_exc_prj_loci)},
+ {"Compara exclusive projected loci:": str(cmp_exc_prj_loci)},
+ {"All projected loci (union):": str(union_prj_loci)},
+ {"All shared projected loci (intersection):": str(intersection_prj_loci)},
+ {"Inparanoid projected loci coverage:": inp_prj_loci_coverage},
+ {"Compara projected loci coverage:": cmp_prj_loci_coverage},
+
+ ])
+
for item in list_stats :
for k, v in item.iteritems() :
- COMP_STATS.write(k + " " + v + "\n")
- COMP_STATS.close()
-
- INP_OUT_FILE = open(inparanoid_output_path,'w')
- INP_FLAT_OUT_FILE = open(inparanoid_output_path + ".flat",'w')
- for k, v in sorted(dict_inp_map.iteritems()) :
- INP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n")
- for projection in sorted(v) :
- INP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
- INP_OUT_FILE.close()
- INP_FLAT_OUT_FILE.close()
-
- ENS_OUT_FILE = open(ensembl_output_path,'w')
- ENS_FLAT_OUT_FILE = open(ensembl_output_path + ".flat",'w')
- for k, v in sorted(dict_ens_map.iteritems()) :
- ENS_OUT_FILE.write(k + "\t" + " ".join(v) + "\n")
- for projection in sorted(v) :
- ENS_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
- ENS_OUT_FILE.close()
- ENS_FLAT_OUT_FILE.close()
+ print(k + " " + v)
+
+ if comparison_file_path :
+ COMP_STATS = open(comparison_file_path, 'w')
+ for item in list_stats :
+ for k, v in item.iteritems() :
+ COMP_STATS.write(k + " " + v + "\n")
+ COMP_STATS.close()
+
+ if inparanoid_output_path :
+ INP_OUT_FILE = open(inparanoid_output_path,'w')
+ INP_FLAT_OUT_FILE = open(inparanoid_output_path + ".flat",'w')
+ for k, v in sorted(dict_inp_map.iteritems()) :
+ INP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n")
+ for projection in sorted(v) :
+ INP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
+ INP_OUT_FILE.close()
+ INP_FLAT_OUT_FILE.close()
+
+ if compara_output_path :
+ CMP_OUT_FILE = open(compara_output_path,'w')
+ CMP_FLAT_OUT_FILE = open(compara_output_path + ".flat",'w')
+ for k, v in sorted(dict_cmp_map.iteritems()) :
+ CMP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n")
+ for projection in sorted(v) :
+ CMP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
+ CMP_OUT_FILE.close()
+ CMP_FLAT_OUT_FILE.close()
+
#----------------------------------------------------------------------------------------------------------------------
def write_reactome_files(dict_map, reactome_gene_protein_path, reactome_projection_path, projection_prefix) :