From: preecej <preecej@localhost>
Date: Sat, 5 Apr 2014 01:41:47 +0000 (+0000)
Subject: complete refactor of projection comparison code; added more projection
X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=78aace236712df95005368765b7c89714d34760e;p=old-jaiswallab-svn%2F.git

complete refactor of projection comparison code; added more projection
stats

svn path=/; revision=569
---

diff --git a/Personnel/preecej/python_singletons/incomparanoid.py b/Personnel/preecej/python_singletons/incomparanoid.py
index 7841bd3..82d620d 100755
--- a/Personnel/preecej/python_singletons/incomparanoid.py
+++ b/Personnel/preecej/python_singletons/incomparanoid.py
@@ -19,29 +19,6 @@ dict_uniprot_map = {}
 dict_ens_map = {}
 dict_inp_map = {}
 
-#----------------------------------------------------------------------------------------------------------------------
-# classes
-#----------------------------------------------------------------------------------------------------------------------
-class switch(object):
-    def __init__(self, value):
-        self.value = value
-        self.fall = False
-
-    def __iter__(self):
-        """Return the match method once, then stop"""
-        yield self.match
-        raise StopIteration
-    
-    def match(self, *args):
-        """Indicate whether or not to enter a case suite"""
-        if self.fall or not args:
-            return True
-        elif self.value in args:
-            self.fall = True
-            return True
-        else:
-            return False
-
 #----------------------------------------------------------------------------------------------------------------------
 # functions
 #----------------------------------------------------------------------------------------------------------------------
@@ -65,11 +42,9 @@ def create_dict_uniprot_map(uniprot_substitution_path) :
             dict_uniprot_map[loc] = uniprot
     UNI.close()
     
-    #for k, v in sorted(dict_uniprot_map.iteritems()) :
-        #print k, v 
-    
     return dict_uniprot_map
 
+
 #----------------------------------------------------------------------------------------------------------------------
 def create_inp_map(inparanoid_input_path, dict_uniprot_map) :
 #----------------------------------------------------------------------------------------------------------------------
@@ -77,8 +52,6 @@ def create_inp_map(inparanoid_input_path, dict_uniprot_map) :
     open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci
     """ 
     dict_inp_map = {} # local ensembl orthology dict
-    count_loci = 0
-    count_projections = 0
 
     INP = open(inparanoid_input_path)
     for line in INP :
@@ -96,13 +69,6 @@ def create_inp_map(inparanoid_input_path, dict_uniprot_map) :
         else :
             dict_inp_map[os_locus] = set([prj_locus])
     INP.close()
-
-    for k, v in sorted(dict_inp_map.iteritems()) :
-        #print k, v 
-        count_projections += len(set(v))
-        count_loci += 1
-
-    list_stats.extend([{"[Inparanoid Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}])
     
     return dict_inp_map
 
@@ -117,8 +83,6 @@ def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_
     """
     dict_ens_map = {} # local ensembl orthology dict
     dict_rap_map = {} # local MSU-RAP dict, w/ only filtered canonical LOC loci (orig. ".1")
-    count_loci = 0
-    count_projections = 0
 
     # generate internal MSU-RAP map    
     RAP_MAP = open(rap_map_path)
@@ -136,9 +100,6 @@ def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_
                     break;                    
     RAP_MAP.close()
 
-    #for k, v in dict_rap_map.iteritems() :
-    #    print k, v 
-
     # generate ref loci filter 
     FILTER = open(filtering_loci_path)
     loci_filter = set()
@@ -165,114 +126,136 @@ def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_
                         else :
                             dict_ens_map[os_locus] = set([cols[1]])
     ENS.close()
-
-    for k, v in dict_ens_map.iteritems() :
-        #print k, v
-        count_loci += 1 
-        count_projections += len(v)
-
-    list_stats.extend([{"\n[Ensemble Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}])
     
     return dict_ens_map
 
 
 #----------------------------------------------------------------------------------------------------------------------
-def compare_maps(dict_ens_map, dict_inp_map, comparison_file_path, ensembl_output_path, inparanoid_output_path) :
+def compare_maps(dict_cmp_map, dict_inp_map, comparison_file_path, compara_output_path, inparanoid_output_path) :
 #----------------------------------------------------------------------------------------------------------------------
     """compare Ensembl and Inparanoid projection results"""
-    inc_os = 0              # Common Os loci
-    exc_ens_os = 0          # Exclusive Ensembl Os loci
-    exc_inp_os = 0          # Exclusive Inparanoid Os loci
-    total_common_prj = 0     # Total Common projected loci
-    inc_common_prj = 0       # Common projected loci where present in both Ens and Inp
-    avg_prj_overlap = 0.0    # Avg. common projected loci
-    avg_inc_prj_overlap = 0.0    # Avg. common projected loci present in both Ens and Inp
+    
+    # reference loci
+    inp_ref_loci = len(dict_inp_map.keys()) # inparanoid ref loci
+    cmp_ref_loci = len(dict_cmp_map.keys()) # compara ref loci
 
-    ref_dict = {}
-    set_ens_os_loci = set()
-    set_inp_os_loci = set()
+    set_inp_ref_loci = set(dict_inp_map.keys())
+    set_cmp_ref_loci = set(dict_cmp_map.keys())
     
-    #ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # of common loci]}
+    inp_exc_ref_loci = len(set_inp_ref_loci - set_cmp_ref_loci)         # inparanoid exclusive ref loci
+    cmp_exc_ref_loci = len(set_cmp_ref_loci - set_inp_ref_loci)         # compara exclusive ref loci
+    union_ref_loci = len(set_inp_ref_loci | set_cmp_ref_loci)           # all ref loci 
+    intersection_ref_loci = len(set_inp_ref_loci & set_cmp_ref_loci)    # all shared ref loci
 
-    # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level
+    inp_ref_loci_coverage = '{:.2%}'.format(inp_ref_loci/float(union_ref_loci))
+    cmp_ref_loci_coverage = '{:.2%}'.format(cmp_ref_loci/float(union_ref_loci))
 
-    for os_locus, prj_loci in dict_ens_map.iteritems() :
+    """
+    projected loci (all non-unique, meaning recounted relative to each ref locus; in other words, 
+    not a collapsed set of projections that would ignore 1-to-many ref loci projections
+    """
+    inp_prj_loci = sum([len(v) for v in dict_inp_map.itervalues()]) # total inparanoid projected loci
+    cmp_prj_loci = sum([len(v) for v in dict_cmp_map.itervalues()]) # total compara projected loci
 
-        # build ref_dict
-        if os_locus in ref_dict :
-            ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot
+    # iterate over both map files and build an overlap map, for the purpose of counting inclusive and exclusive projections by locus
+    # ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # of common loci]}
+    ref_dict = {}
+    
+    # build ref_dict (Compara)
+    for ref_locus, prj_loci in dict_cmp_map.iteritems() :
+        if ref_locus in ref_dict :
+            ref_dict[ref_locus][0].extend(prj_loci) # add Compara projected loci in the first list slot
         else :
-            ref_dict[os_locus] = [prj_loci, []]
-        
-        # build list of os ens loci for set operations
-        set_ens_os_loci.add(os_locus)
+            ref_dict[ref_locus] = [prj_loci, []]
 
-    for os_locus, prj_loci in dict_inp_map.iteritems() :
-
-        # continue build of ref_dict
-        if os_locus in ref_dict :
-            ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot
+    # continue build of ref_dict (Inparanoid)
+    for ref_locus, prj_loci in dict_inp_map.iteritems() :
+        if ref_locus in ref_dict :
+            ref_dict[ref_locus][1].extend(prj_loci) # put the Inparanoid projected loci in the second list slot
         else :
-            ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot
-
-        # build list of os inp loci for set operations
-        set_inp_os_loci.add(os_locus)
+            ref_dict[ref_locus] = [[], prj_loci] # create an empty slot for the Compara projected loci, put the Inparanoid projected loci in the second list slot
 
-    #print set_ens_os_loci & set_inp_os_loci
+    inp_exc_prj_loci = 0        # inparanoid exclusive projected loci
+    cmp_exc_prj_loci = 0        # compara exclusive projected loci
+    union_prj_loci = 0          # all projections, duplicates removed only at the ref locus level
+    intersection_prj_loci = 0   # all overlapping projections 
 
     # calculate common projected loci overlap sets
     for k, v in ref_dict.iteritems() :
-        curr_comm_prj_loci = len(set(v[0]) & set(v[1]))
-        v.append(curr_comm_prj_loci)
-        total_common_prj += curr_comm_prj_loci
-        if curr_comm_prj_loci > 0 :
-            inc_common_prj += curr_comm_prj_loci
+        curr_intersection_prj_loci = len(set(v[0]) & set(v[1]))
+        v.append(curr_intersection_prj_loci)
+        intersection_prj_loci += curr_intersection_prj_loci
+
+        union_prj_loci += len(set(v[0]) | set(v[1]))
+        inp_exc_prj_loci += len(set(v[1]) - set(v[0]))
+        cmp_exc_prj_loci += len(set(v[0]) - set(v[1]))
 
-    inc_os = len(set_ens_os_loci & set_inp_os_loci)
-    exc_ens_os = len(set_ens_os_loci - set_inp_os_loci)
-    exc_inp_os = len(set_inp_os_loci - set_ens_os_loci)
+    inp_prj_loci_coverage = '{:.2%}'.format(inp_prj_loci/float(union_prj_loci))
+    cmp_prj_loci_coverage = '{:.2%}'.format(cmp_prj_loci/float(union_prj_loci))
+
+    # other stats
+    #avg_projections_per_locus
+    #avg_overlapping_projections_per_locus
 
     # calculate avg. common projected loci overlaps    
-    avg_prj_overlap = float(total_common_prj) / float(len(ref_dict))
-    avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os)
-    
-    #for k, v in ref_dict.iteritems() :
-    #    print k + "\n    Ens: " + str(v[0]) + "\n    Inp: " + str(v[1]) + "\n    Common: " + str(v[2]) 
-    
-    list_stats.extend([{"\n[Comparison Stats]": ""}, 
-        {"Common Os loci:": str(inc_os)}, 
-        {"Exclusive Ensembl Os loci:": str(exc_ens_os)}, 
-        {"Exclusive Inparanoid Os loci:": str(exc_inp_os)}, 
-        {"Total Os loci:": str(inc_os + exc_ens_os + exc_inp_os)}, 
-        {"Percentage Reference Loci Coverage (Ensemble):": str((float(inc_os + exc_ens_os)/float(inc_os + exc_ens_os + exc_inp_os)*100))}, 
-        {"":""}, 
-        {"Common projected loci:": str(total_common_prj)},
-	    {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)},
-	    {"Avg. common projected loci:": str(avg_prj_overlap)}])
-
-    COMP_STATS = open(comparison_file_path, 'w')
+    #avg_prj_overlap = float(total_intersection_projections) / float(len(ref_dict))
+    #avg_inc_prj_overlap = float(total_union_projections) / float(inc_os)
+
+    list_stats.extend([{"[Comparison Stats]": ""},
+            
+        {"\n-- Reference Loci --":""}, 
+        {"Inparanoid reference loci:": str(inp_ref_loci)}, 
+        {"Compara reference loci:": str(cmp_ref_loci)}, 
+        {"Exclusive Inparanoid reference loci:": str(inp_exc_ref_loci)}, 
+        {"Exclusive Compara reference loci:": str(cmp_exc_ref_loci)}, 
+        {"All reference loci (union):": str(union_ref_loci)},
+        {"All shared reference loci (intersection):": str(intersection_ref_loci)},
+        {"Inparanoid reference loci coverage:": inp_ref_loci_coverage},
+        {"Compara reference loci coverage:": cmp_ref_loci_coverage},
+        
+        {"\n-- Projected Loci (all non-unique with respect to reference loci) --":""}, 
+        {"Inparanoid projected loci:": str(inp_prj_loci)},
+        {"Compara projected loci:": str(cmp_prj_loci)},
+        {"Inparanoid exclusive projected loci:": str(inp_exc_prj_loci)},
+        {"Compara exclusive projected loci:": str(cmp_exc_prj_loci)},
+        {"All projected loci (union):": str(union_prj_loci)},
+        {"All shared projected loci (intersection):": str(intersection_prj_loci)},
+        {"Inparanoid projected loci coverage:": inp_prj_loci_coverage},
+        {"Compara projected loci coverage:": cmp_prj_loci_coverage},
+
+    ])     
+
     for item in list_stats :
         for k, v in item.iteritems() :
-            COMP_STATS.write(k + " " + v + "\n")
-    COMP_STATS.close()
-
-    INP_OUT_FILE = open(inparanoid_output_path,'w')
-    INP_FLAT_OUT_FILE = open(inparanoid_output_path + ".flat",'w')
-    for k, v in sorted(dict_inp_map.iteritems()) :
-        INP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n")
-        for projection in sorted(v) :
-            INP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
-    INP_OUT_FILE.close()
-    INP_FLAT_OUT_FILE.close()
-
-    ENS_OUT_FILE = open(ensembl_output_path,'w')
-    ENS_FLAT_OUT_FILE = open(ensembl_output_path + ".flat",'w')
-    for k, v in sorted(dict_ens_map.iteritems()) :
-        ENS_OUT_FILE.write(k + "\t" + " ".join(v) + "\n")
-        for projection in sorted(v) :
-            ENS_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
-    ENS_OUT_FILE.close()
-    ENS_FLAT_OUT_FILE.close()
+            print(k + " " + v)
+
+    if comparison_file_path :
+        COMP_STATS = open(comparison_file_path, 'w')
+        for item in list_stats :
+            for k, v in item.iteritems() :
+                COMP_STATS.write(k + " " + v + "\n")
+        COMP_STATS.close()
+
+    if inparanoid_output_path :
+        INP_OUT_FILE = open(inparanoid_output_path,'w')
+        INP_FLAT_OUT_FILE = open(inparanoid_output_path + ".flat",'w')
+        for k, v in sorted(dict_inp_map.iteritems()) :
+            INP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n")
+            for projection in sorted(v) :
+                INP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
+        INP_OUT_FILE.close()
+        INP_FLAT_OUT_FILE.close()
+
+    if compara_output_path :
+        CMP_OUT_FILE = open(compara_output_path,'w')
+        CMP_FLAT_OUT_FILE = open(compara_output_path + ".flat",'w')
+        for k, v in sorted(dict_cmp_map.iteritems()) :
+            CMP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n")
+            for projection in sorted(v) :
+                CMP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
+        CMP_OUT_FILE.close()
+        CMP_FLAT_OUT_FILE.close()
+
 
 #----------------------------------------------------------------------------------------------------------------------
 def write_reactome_files(dict_map, reactome_gene_protein_path, reactome_projection_path, projection_prefix) :