return dict_uniprot_map
#----------------------------------------------------------------------------------------------------------------------
-def create_inp_map(inparanoid_input_path, dict_uniprot_map) :
+def create_inp_map(inparanoid_input_path, dict_uniprot_map, projection_prefix) :
#----------------------------------------------------------------------------------------------------------------------
"""
open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci
# swap loc for uniprot, if specified
if dict_uniprot_map :
os_locus = dict_uniprot_map[os_locus]
- prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#')
+ prj_locus = (projection_prefix if projection_prefix else "") + cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#')
if os_locus in dict_inp_map :
dict_inp_map[os_locus].add(prj_locus)
else :
#----------------------------------------------------------------------------------------------------------------------
-def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_id, dict_uniprot_map) :
+def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_id, dict_uniprot_map, projection_prefix) :
#----------------------------------------------------------------------------------------------------------------------
"""
open the ensemble plants and rap::irgsp mapping files and generate a hash mapping of reference to projected loci where
# reciprocal identity is >= recip_id%, high confidence
if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 :
if os_locus in dict_ens_map :
- dict_ens_map[os_locus].add(cols[1])
+ dict_ens_map[os_locus].add((projection_prefix if projection_prefix else "") + cols[1]) # optional prefix; empty/None adds nothing (same rule as the new-key branch)
else :
- dict_ens_map[os_locus] = set([cols[1]])
+ dict_ens_map[os_locus] = set([(projection_prefix if projection_prefix else "") + cols[1]])
ENS.close()
for k, v in dict_ens_map.iteritems() :
# TODO: add an "inparanoid super-cluster vs. conventional input" flag
# output settings
+parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projected protein identifiers', default='')
parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons')
parser.add_argument('-E', '--ensembl_output_path', help='output file containing flat (1-to-many) ensemble ortho pairs')
parser.add_argument('-I', '--inparanoid_output_path', help='output file containing flat (1-to-many) inparanoid ortho pairs')
dict_uniprot_map = create_dict_uniprot_map(args.uniprot_substitution)
# create projection maps
-dict_inp_map = create_inp_map(args.inparanoid_input_path, dict_uniprot_map)
-dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map)
+dict_inp_map = create_inp_map(args.inparanoid_input_path, dict_uniprot_map, args.projection_prefix)
+dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, args.projection_prefix)
# generate stats and output them
compare_maps(dict_ens_map, dict_inp_map, args.comparison_file_path, args.ensembl_output_path, args.inparanoid_output_path)