def create_inp_map(inparanoid_input_path, dict_uniprot_map) :
#----------------------------------------------------------------------------------------------------------------------
"""
- open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci
+ open the inparanoid file (which is already loci-filtered for the curated reference set and does not require RAP-to-MSU back-conversion) and generate a 2-column mapping of PRJ to LOC loci
"""
dict_inp_map = {} # local ensembl orthology dict
INP = open(inparanoid_input_path)
for line in INP :
cols = line.rstrip().split()
+ # for now, the canonical-ortholog filter below is suspended (left commented out); all transcripts are accepted: isoforms, multiple sequencing lane IDs, etc.
# ignore non-canonical orthologs
- if int(cols[1][-1]) > 1 :
- continue
+ #if int(cols[1][-1]) > 1 :
+ # continue
os_locus = cols[0].rstrip("1").rstrip(".")
# swap loc for uniprot, if specified
if dict_uniprot_map :
os_locus = dict_uniprot_map[os_locus]
- prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#')
+ #prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0].rsplit("-",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#', '-#')
+ prj_locus = cols[1]
if os_locus in dict_inp_map :
dict_inp_map[os_locus].add(prj_locus)
else :
for line in ENS :
cols = line.rstrip().split()
if len(cols) == 5 :
- if cols[0] in dict_rap_map :
- os_locus = dict_rap_map[cols[0]]
- if os_locus in loci_filter :
- # swap loc for uniprot, if specified
- if dict_uniprot_map :
- os_locus = dict_uniprot_map[os_locus]
+ if dict_uniprot_map :
+ # if this Ensembl locus ID (cols[0]) is already a key in the uniprot map
+ if cols[0] in dict_uniprot_map :
+ # get uniprot id and build into ens_map
+ os_locus = dict_uniprot_map[cols[0]]
# reciprocal identity is >= recip_id%, optional high confidence
if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident :
if os_locus in dict_ens_map :
dict_ens_map[os_locus].add(cols[1])
else :
dict_ens_map[os_locus] = set([cols[1]])
+ else :
+ if cols[0] in dict_rap_map :
+ os_locus = dict_rap_map[cols[0]]
+ if os_locus in loci_filter :
+ # swap loc for uniprot
+ os_locus = dict_uniprot_map[os_locus]
+ # reciprocal identity is >= recip_id%, optional high confidence
+ if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident :
+ if os_locus in dict_ens_map :
+ dict_ens_map[os_locus].add(cols[1])
+ else :
+ dict_ens_map[os_locus] = set([cols[1]])
ENS.close()
return dict_ens_map
# TODO: add an "inparanoid super-cluster vs. conventional input" flag
# output settings
-parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projectied protein identifiers', default='') # e.g. 'MaizeGDB:'
+parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projected protein identifiers', default='') # e.g. 'MaizeGDB:'
parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons')
parser.add_argument('-E', '--ensembl_output_path', help='output file containing flat (1-to-many) ensemble ortho pairs')
parser.add_argument('-I', '--inparanoid_output_path', help='output file containing flat (1-to-many) inparanoid ortho pairs')
# create projection maps
if (args.inparanoid_input_path) :
dict_inp_map = create_inp_map(args.inparanoid_input_path, dict_uniprot_map)
-dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, 1 if args.confidence_high else 0)
+if (args.ensembl_input_path) :
+ dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, 1 if args.confidence_high else 0)
-# generate stats and output them
-if (args.inparanoid_input_path) :
+# generate stats and output them; assumes both inparanoid and ensembl data have been provided
+if (args.comparison_file_path) :
all_venn_data = compare_maps(dict_ens_map, dict_inp_map, args.comparison_file_path, args.ensembl_output_path, args.inparanoid_output_path)
if args.generate_reactome_output == 'ensembl' :