major development complete; now testing w/ different projected species

author preecej <preecej@localhost>

Sat, 8 Mar 2014 01:07:05 +0000 (01:07 +0000)

committer preecej <preecej@localhost>

Sat, 8 Mar 2014 01:07:05 +0000 (01:07 +0000)
author preecej <preecej@localhost>
Sat, 8 Mar 2014 01:07:05 +0000 (01:07 +0000)
committer preecej <preecej@localhost>
Sat, 8 Mar 2014 01:07:05 +0000 (01:07 +0000)
diff --git a/Personnel/preecej/python_singletons/map_os_2_at.py b/Personnel/preecej/python_singletons/map_os_2_at.py

index 7a27776896cffdcb2ea972e6a16adfd83037c330..b784e8ed17de4febd8acca144b55612dff1bf4fb 100755 (executable)
--- a/Personnel/preecej/python_singletons/map_os_2_at.py
+++ b/Personnel/preecej/python_singletons/map_os_2_at.py
@@ -1,37 +1,226 @@
  #!/bin/python
+"""
+Script with different analysis methods used to compare Ensembl and Inparanoid species projections.
+"""
  
-# open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of AT to LOC loci where reciprocal identity is >= 50% and confidence is high 
-
-path = "/home/preecej/Documents/projects/plant_reactome/plant_reactome_site/projection/rice_to_arabidopsis/"
-
-dict_ens_ids = {}
-
-ENS = open(path + "ensembl_plants_40_os_2_at_uniq.tab")
-ENS.readline();
-for line in ENS :
-    cols = line.rstrip().split()
-    if len(cols) == 5 :
-        if int(cols[2]) >= 50 and int(cols[3]) >= 50 and int(cols[4]) == 1 :  # reciprocal identity is >= 50%, high confidence
-            if cols[0] in dict_ens_ids.keys() :
-                dict_ens_ids[cols[0]].append(cols[1])
-            else :
-                dict_ens_ids[cols[0]] = [cols[1]]
-ENS.close()
-
-#for k, v in dict_ens_ids.iteritems() :
-#    print k, v 
-
-OS_2_AT_MAP = open(path + "ensembl_ortho_os_2_at.tab",'w')
- 
-RAP_IRGSP = open(path + "loc_rap_mappings.txt")
-RAP_IRGSP.readline();
-for line in RAP_IRGSP:
-    if line.strip() != "" :
+#----------------------------------------------------------------------------------------------------------------------
+# modules
+#----------------------------------------------------------------------------------------------------------------------
+import os
+import sys
+
+#----------------------------------------------------------------------------------------------------------------------
+# globals
+#----------------------------------------------------------------------------------------------------------------------
+path = os.getcwd() + "/"
+input_file = sys.argv[1]
+output_file = sys.argv[2]
+map_source_type = sys.argv[3] 
+
+#----------------------------------------------------------------------------------------------------------------------
+# classes
+#----------------------------------------------------------------------------------------------------------------------
+class switch(object):
+    def __init__(self, value):
+        self.value = value
+        self.fall = False
+
+    def __iter__(self):
+        """Return the match method once, then stop"""
+        yield self.match
+        raise StopIteration
+    
+    def match(self, *args):
+        """Indicate whether or not to enter a case suite"""
+        if self.fall or not args:
+            return True
+        elif self.value in args:
+            self.fall = True
+            return True
+        else:
+            return False
+
+#----------------------------------------------------------------------------------------------------------------------
+# functions
+#----------------------------------------------------------------------------------------------------------------------
+
+#----------------------------------------------------------------------------------------------------------------------
+def ens_map(input_file, output_file) :
+#----------------------------------------------------------------------------------------------------------------------
+    """
+    open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where 
+    reciprocal identity is >= 50% and confidence is high
+    """
+    
+    count_loc = 0
+    count_prj = 0
+    dict_ens_ids = {}
+    ENS = open(path + input_file)
+    ENS.readline();
+    for line in ENS :
+        cols = line.rstrip().split()
+        if len(cols) == 5 :
+            if int(cols[2]) >= 50 and int(cols[3]) >= 50 and int(cols[4]) == 1 :  # reciprocal identity is >= 50%, high confidence
+                if cols[0] in dict_ens_ids.keys() :
+                    dict_ens_ids[cols[0]].append(cols[1])
+                else :
+                    dict_ens_ids[cols[0]] = [cols[1]]
+    ENS.close()
+
+    #for k, v in dict_ens_ids.iteritems() :
+    #    print k, v 
+
+    OS_2_PRJ_MAP = open(path + output_file,'w')
+     
+    RAP_IRGSP = open(path + "loc_rap_mappings_uniq.txt")
+    RAP_IRGSP.readline();
+    for line in RAP_IRGSP:
+        if line.strip() != "" :
+            cols = line.rstrip().split()
+            rap_id = cols[1].upper()
+            if rap_id in dict_ens_ids and rap_id != "NONE" :
+                #print cols[0] + "\t" + ",".join(dict_ens_ids[rap_id])
+                OS_2_PRJ_MAP.write(cols[0].upper() + "\t" + ",".join(dict_ens_ids[rap_id]) + "\n")
+                count_prj += len(dict_ens_ids[rap_id])
+                count_loc += 1
+    RAP_IRGSP.close()
+    OS_2_PRJ_MAP.close()
+    
+    return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
+        
+#----------------------------------------------------------------------------------------------------------------------
+def inp_map(input_file, output_file) :
+#----------------------------------------------------------------------------------------------------------------------
+    """
+    open the inparanoid file and generate a 2-col mapping of PRJ to LOC loci
+    """ 
+    
+    count_loc = 0
+    count_prj = 0
+    dict_inp_ids = {}
+    INP = open(path + input_file)
+    INP.readline();
+    for line in INP :
+        cols = line.rstrip().split()
+        os_locus = cols[1].rstrip(".1")
+        prj_locus = cols[2].rstrip(".1")
+        if os_locus in dict_inp_ids.keys() :
+            dict_inp_ids[os_locus].append(prj_locus)
+        else :
+            dict_inp_ids[os_locus] = [prj_locus]
+    INP.close()
+
+    OS_2_PRJ_MAP = open(path + output_file,'w')
+    for k, v in sorted(dict_inp_ids.iteritems()) :
+        #print k, v 
+        OS_2_PRJ_MAP.write(k + "\t" + ",".join(v) + "\n")
+        count_prj += len(v)
+        count_loc += 1
+    OS_2_PRJ_MAP.close()
+
+    return [{"Unique Os loci:": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
+
+#----------------------------------------------------------------------------------------------------------------------
+def compare_maps(input_file, output_file) :
+#----------------------------------------------------------------------------------------------------------------------
+    """compare Ensembl and Inparanoid projection results"""
+    
+    inc_os = 0              # Common Os loci
+    exc_ens_os = 0          # Exclusive Ensembl Os loci
+    exc_inp_os = 0          # Exclusive Inparanoid Os loci
+    total_common_prj = 0     # Total Common projected loci
+    inc_common_prj = 0       # Common projected loci where present in both Ens and Inp
+    avg_prj_overlap = 0.0    # Avg. common projected loci
+    avg_inc_prj_overlap = 0.0    # Avg. common projected loci present in both Ens and Inp
+
+    ref_dict = {}
+    set_ens_os_loci = set()
+    set_inp_os_loci = set()
+
+    """
+    ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # common loci]}
+    """
+    # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level
+
+    ENS = open(path + input_file.split(',')[0])
+    for line in ENS :
          cols = line.rstrip().split()
-        rap_id = cols[1].upper()
-        if rap_id in dict_ens_ids and rap_id != "NONE" :
-            #print cols[0] + "\t" + ",".join(dict_ens_ids[rap_id]) 
-            OS_2_AT_MAP.write(cols[0] + "\t" + ",".join(dict_ens_ids[rap_id]) + "\n")
-RAP_IRGSP.close()
+        os_locus = cols[0]
+        prj_loci = cols[1].split(',')
+
+        # build ref_dict
+        if os_locus in ref_dict.keys() :
+            ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot
+        else :
+            ref_dict[os_locus] = [prj_loci, []]
+        
+        # build list of os ens loci for set operations
+        set_ens_os_loci.add(os_locus)
+    ENS.close()
+
+    INP = open(path + input_file.split(',')[1])
+    for line in INP :
+        cols = line.rstrip().split()
+        os_locus = cols[0]
+        prj_loci = cols[1].split(',')
+
+        # continue build of ref_dict
+        if os_locus in ref_dict.keys() :
+            ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot
+        else :
+            ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot
+
+        # build list of os inp loci for set operations
+        set_inp_os_loci.add(os_locus)
+    INP.close()
+
+    #print set_ens_os_loci & set_inp_os_loci
+
+    # calculate common projected loci overlap sets
+    for k, v in ref_dict.iteritems() :
+        curr_comm_prj_loci = len(set(v[0]) & set(v[1]))
+        v.append(curr_comm_prj_loci)
+        total_common_prj += curr_comm_prj_loci
+        if curr_comm_prj_loci > 0 :
+            inc_common_prj += curr_comm_prj_loci
+
+    inc_os = len(set_ens_os_loci & set_inp_os_loci)
+    exc_ens_os = len(set_ens_os_loci - set_inp_os_loci)
+    exc_inp_os = len(set_inp_os_loci - set_ens_os_loci)
+
+    # calculate avg. common projected loci overlaps    
+    avg_prj_overlap = float(total_common_prj) / float(len(ref_dict))
+    avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os)
+    
+    #for k, v in ref_dict.iteritems() :
+    #    print k + "\n    Ens: " + str(v[0]) + "\n    Inp: " + str(v[1]) + "\n    Common: " + str(v[2]) 
+    
+    return [{"Common Os loci:": str(inc_os)}, 
+        {"Exclusive Ensembl Os loci:": str(exc_ens_os)}, 
+        {"Exclusive Inparanoid Os loci:": str(exc_inp_os)}, 
+        {"Total common projected loci:": str(total_common_prj)},
+        {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)},
+        {"Avg. common projected loci:": str(avg_prj_overlap)}]
+    
+#----------------------------------------------------------------------------------------------------------------------
+# main
+#----------------------------------------------------------------------------------------------------------------------
+list_counts = []
+
+for case in switch(map_source_type):
+    if case("ens"):
+        list_counts = ens_map(input_file, output_file)
+        break
+    if case("inp"):
+        list_counts = inp_map(input_file, output_file)
+        break
+    if case("compare"):
+        list_counts = compare_maps(input_file, output_file) # input_file is actually two comma-del input files (ens 1st, inp 2nd)
+        break
+    if case(): # default, could also just omit condition or 'if True'
+        print "Must provide a valid map source type: 'ens' (Ensembl), 'inp' (Inparanoid), or 'compare'"
+
+for item in list_counts :
+    for k, v in item.iteritems():
+        print k, v
  
-OS_2_AT_MAP.close()
author	preecej <preecej@localhost>
	Sat, 8 Mar 2014 01:07:05 +0000 (01:07 +0000)
committer	preecej <preecej@localhost>
	Sat, 8 Mar 2014 01:07:05 +0000 (01:07 +0000)