From 11f0b52e5dc8b11909500fb369d5af0fecfa54f7 Mon Sep 17 00:00:00 2001
From: preecej <preecej@localhost>
Date: Wed, 23 Mar 2016 01:28:28 +0000
Subject: [PATCH] now handles direct RAP:UniProt mapping for Ensembl data, not
 just RAP:MSU:UniProt; more inclusive

svn path=/; revision=644
---
 .../python_singletons/incomparanoid.py        | 42 ++++++++++++-------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/Personnel/preecej/python_singletons/incomparanoid.py b/Personnel/preecej/python_singletons/incomparanoid.py
index 75e4856..2d1ed3a 100755
--- a/Personnel/preecej/python_singletons/incomparanoid.py
+++ b/Personnel/preecej/python_singletons/incomparanoid.py
@@ -53,21 +53,23 @@ def create_dict_uniprot_map(uniprot_substitution_path) :
 def create_inp_map(inparanoid_input_path, dict_uniprot_map) :
 #----------------------------------------------------------------------------------------------------------------------
     """
-    open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci
+    open the inparanoid file (which is already loci-filtered for curated reference set and does not require RAP-to-MSU back-conversion) and generate a 2-col mapping of PRJ to LOC loci
     """ 
     dict_inp_map = {} # local ensembl orthology dict
 
     INP = open(inparanoid_input_path)
     for line in INP :
         cols = line.rstrip().split()
+        # NOTE: the canonical-only filter below is suspended for now; all transcripts are accepted (isoforms, multiple sequencing lane IDs, etc.)
         # ignore non-canonical orthologs
-        if int(cols[1][-1]) > 1 :
-            continue
+        #if int(cols[1][-1]) > 1 :
+        #    continue
         os_locus = cols[0].rstrip("1").rstrip(".")
         # swap loc for uniprot, if specified
         if dict_uniprot_map :
             os_locus = dict_uniprot_map[os_locus]
-        prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#')
+        #prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0].rsplit("-",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#', '-#')
+        prj_locus = cols[1]
         if os_locus in dict_inp_map :
             dict_inp_map[os_locus].add(prj_locus)
         else :
@@ -123,18 +125,29 @@ def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_
     for line in ENS :
         cols = line.rstrip().split()
         if len(cols) == 5 :
-            if cols[0] in dict_rap_map :
-                os_locus = dict_rap_map[cols[0]]
-                if os_locus in loci_filter :
-                    # swap loc for uniprot, if specified
-                    if dict_uniprot_map :
-                        os_locus = dict_uniprot_map[os_locus]
+            if dict_uniprot_map :
+                # if this ens os is already in uniprot list
+                if cols[0] in dict_uniprot_map :
+                    # get uniprot id and build into ens_map
+                    os_locus = dict_uniprot_map[cols[0]]
                     # reciprocal identity is >= recip_id%, optional high confidence
                     if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident :
                         if os_locus in dict_ens_map :
                             dict_ens_map[os_locus].add(cols[1])
                         else :
                             dict_ens_map[os_locus] = set([cols[1]])
+                else :
+                    if cols[0] in dict_rap_map :
+                        os_locus = dict_rap_map[cols[0]]
+                        if os_locus in loci_filter :
+                            # swap loc for uniprot
+                            os_locus = dict_uniprot_map[os_locus]
+                            # reciprocal identity is >= recip_id%, optional high confidence
+                            if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident :
+                                if os_locus in dict_ens_map :
+                                    dict_ens_map[os_locus].add(cols[1])
+                                else :
+                                    dict_ens_map[os_locus] = set([cols[1]])
     ENS.close()
     
     return dict_ens_map
@@ -387,7 +400,7 @@ parser.add_argument('--proj_species', help='projection species')
 # TODO: add an "inparanoid super-cluster vs. conventional input" flag
 
 # output settings
-parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projectied protein identifiers', default='') # e.g. 'MaizeGDB:'
+parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projected protein identifiers', default='') # e.g. 'MaizeGDB:'
 parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons')
 parser.add_argument('-E', '--ensembl_output_path', help='output file containing flat (1-to-many) ensemble ortho pairs')
 parser.add_argument('-I', '--inparanoid_output_path', help='output file containing flat (1-to-many) inparanoid ortho pairs')
@@ -407,10 +420,11 @@ if args.uniprot_substitution :
 # create projection maps
 if (args.inparanoid_input_path) :
     dict_inp_map = create_inp_map(args.inparanoid_input_path, dict_uniprot_map)
-dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, 1 if args.confidence_high else 0)
+if (args.ensembl_input_path) :
+    dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, 1 if args.confidence_high else 0)
 
-# generate stats and output them
-if (args.inparanoid_input_path) :
+# generate stats and output them; assumes both inparanoid and ensembl data have been provided
+if (args.comparison_file_path) :
     all_venn_data = compare_maps(dict_ens_map, dict_inp_map, args.comparison_file_path, args.ensembl_output_path, args.inparanoid_output_path)
 
 if args.generate_reactome_output == 'ensembl' :
-- 
2.34.1