now handles direct RAP::UniProt mapping for Ensembl data, not just

author preecej <preecej@localhost>

Wed, 23 Mar 2016 01:28:28 +0000 (01:28 +0000)

committer preecej <preecej@localhost>

Wed, 23 Mar 2016 01:28:28 +0000 (01:28 +0000)
author preecej <preecej@localhost>
Wed, 23 Mar 2016 01:28:28 +0000 (01:28 +0000)
committer preecej <preecej@localhost>
Wed, 23 Mar 2016 01:28:28 +0000 (01:28 +0000)
diff --git a/Personnel/preecej/python_singletons/incomparanoid.py b/Personnel/preecej/python_singletons/incomparanoid.py

index 75e485621b185ef30b1c3a5b6209ae42306f4eda..2d1ed3aecaf06e63177a70efe38ef0e149215cf0 100755 (executable)
--- a/Personnel/preecej/python_singletons/incomparanoid.py
+++ b/Personnel/preecej/python_singletons/incomparanoid.py
@@ -53,21 +53,23 @@ def create_dict_uniprot_map(uniprot_substitution_path) :
  def create_inp_map(inparanoid_input_path, dict_uniprot_map) :
  #----------------------------------------------------------------------------------------------------------------------
      """
-    open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci
+    Open the inparanoid file (already loci-filtered for the curated reference set, so no RAP-to-MSU back-conversion is required) and generate a 2-column mapping of PRJ to LOC loci.
      """ 
      dict_inp_map = {} # local ensembl orthology dict
  
      INP = open(inparanoid_input_path)
      for line in INP :
          cols = line.rstrip().split()
+        # NOTE: the canonical-ortholog filter below is suspended for now; all transcripts are accepted (isoforms, multiple sequencing-lane IDs, etc.)
          # ignore non-canonical orthologs
-        if int(cols[1][-1]) > 1 :
-            continue
+        #if int(cols[1][-1]) > 1 :
+        #    continue
          os_locus = cols[0].rstrip("1").rstrip(".")
          # swap loc for uniprot, if specified
          if dict_uniprot_map :
              os_locus = dict_uniprot_map[os_locus]
-        prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#')
+        #prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0].rsplit("-",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#', '-#')
+        prj_locus = cols[1]
          if os_locus in dict_inp_map :
              dict_inp_map[os_locus].add(prj_locus)
          else :
@@ -123,18 +125,29 @@ def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_
      for line in ENS :
          cols = line.rstrip().split()
          if len(cols) == 5 :
-            if cols[0] in dict_rap_map :
-                os_locus = dict_rap_map[cols[0]]
-                if os_locus in loci_filter :
-                    # swap loc for uniprot, if specified
-                    if dict_uniprot_map :
-                        os_locus = dict_uniprot_map[os_locus]
+            if dict_uniprot_map :
+                # if this Ensembl Os identifier is itself a key in the UniProt map
+                if cols[0] in dict_uniprot_map :
+                    # look up its UniProt id directly and add the pair to dict_ens_map
+                    os_locus = dict_uniprot_map[cols[0]]
                      # reciprocal identity is >= recip_id%, optional high confidence
                      if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident :
                          if os_locus in dict_ens_map :
                              dict_ens_map[os_locus].add(cols[1])
                          else :
                              dict_ens_map[os_locus] = set([cols[1]])
+                else :
+                    if cols[0] in dict_rap_map :
+                        os_locus = dict_rap_map[cols[0]]
+                        if os_locus in loci_filter :
+                            # swap loc for uniprot
+                            os_locus = dict_uniprot_map[os_locus]
+                            # reciprocal identity is >= recip_id%, optional high confidence
+                            if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident :
+                                if os_locus in dict_ens_map :
+                                    dict_ens_map[os_locus].add(cols[1])
+                                else :
+                                    dict_ens_map[os_locus] = set([cols[1]])
      ENS.close()
      
      return dict_ens_map
@@ -387,7 +400,7 @@ parser.add_argument('--proj_species', help='projection species')
  # TODO: add an "inparanoid super-cluster vs. conventional input" flag
  
  # output settings
-parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projectied protein identifiers', default='') # e.g. 'MaizeGDB:'
+parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projected protein identifiers', default='') # e.g. 'MaizeGDB:'
  parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons')
  parser.add_argument('-E', '--ensembl_output_path', help='output file containing flat (1-to-many) ensemble ortho pairs')
  parser.add_argument('-I', '--inparanoid_output_path', help='output file containing flat (1-to-many) inparanoid ortho pairs')
@@ -407,10 +420,11 @@ if args.uniprot_substitution :
  # create projection maps
  if (args.inparanoid_input_path) :
      dict_inp_map = create_inp_map(args.inparanoid_input_path, dict_uniprot_map)
-dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, 1 if args.confidence_high else 0)
+if (args.ensembl_input_path) :
+    dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, 1 if args.confidence_high else 0)
  
-# generate stats and output them
-if (args.inparanoid_input_path) :
+# generate stats and output them; requires both inparanoid and ensembl inputs (dict_inp_map and dict_ens_map are only built above when their input paths are given)
+if (args.comparison_file_path) :
      all_venn_data = compare_maps(dict_ens_map, dict_inp_map, args.comparison_file_path, args.ensembl_output_path, args.inparanoid_output_path)
  
  if args.generate_reactome_output == 'ensembl' :
author	preecej <preecej@localhost>
	Wed, 23 Mar 2016 01:28:28 +0000 (01:28 +0000)
committer	preecej <preecej@localhost>
	Wed, 23 Mar 2016 01:28:28 +0000 (01:28 +0000)