#----------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------------------------
-def create_dict_uniprot_map() :
+def create_dict_uniprot_map(uniprot_substitution_path) :
#----------------------------------------------------------------------------------------------------------------------
"""
create reference-to-uniprot mapping dictionary
"""
dict_uniprot_map = {} # local
- # read map file, populate dict (note possibility of one uniprot id to many ref loci)
+ # read map file, populate dict (it is possible to have a many-to-1 LOC-to-Uniprot relationship; this is ok for projection inference)
+ UNI = open(uniprot_substitution_path)
+ for line in UNI :
+ cols = line.rstrip().split()
+ loc = cols[0]
+ uniprot = cols[1]
+ if loc in dict_uniprot_map :
+ dict_uniprot_map[loc].add(uniprot)
+ else :
+ dict_uniprot_map[loc] = uniprot
+ UNI.close()
+
+ #for k, v in sorted(dict_uniprot_map.iteritems()) :
+ #print k, v
return dict_uniprot_map
#----------------------------------------------------------------------------------------------------------------------
-def create_inp_map(inparanoid_input_path, uniprot_substitution, dict_uniprot_map) :
+def create_inp_map(inparanoid_input_path, dict_uniprot_map) :
#----------------------------------------------------------------------------------------------------------------------
"""
open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci
if int(cols[1][-1]) > 1 :
continue
os_locus = cols[0].rstrip("1").rstrip(".")
+ # swap loc for uniprot, if specified
+ if dict_uniprot_map :
+ os_locus = dict_uniprot_map[os_locus]
prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#')
if os_locus in dict_inp_map :
dict_inp_map[os_locus].add(prj_locus)
#----------------------------------------------------------------------------------------------------------------------
-def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_id, uniprot_substitution, dict_uniprot_map) :
+def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_id, dict_uniprot_map) :
#----------------------------------------------------------------------------------------------------------------------
"""
open the ensemble plants and rap::irgsp mapping files and generate a hash mapping of reference to projected loci where
cols = line.rstrip().split()
if len(cols) == 5 :
if cols[0] in dict_rap_map :
- if dict_rap_map[cols[0]] in loci_filter :
+ os_locus = dict_rap_map[cols[0]]
+ if os_locus in loci_filter :
+ # swap loc for uniprot, if specified
+ if dict_uniprot_map :
+ os_locus = dict_uniprot_map[os_locus]
# reciprocal identity is >= recip_id%, high confidence
if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 :
- if dict_rap_map[cols[0]] in dict_ens_map :
- dict_ens_map[dict_rap_map[cols[0]]].add(cols[1])
+ if os_locus in dict_ens_map :
+ dict_ens_map[os_locus].add(cols[1])
else :
- dict_ens_map[dict_rap_map[cols[0]]] = set([cols[1]])
+ dict_ens_map[os_locus] = set([cols[1]])
ENS.close()
for k, v in dict_ens_map.iteritems() :
parser.add_argument('-i', '--inparanoid_input_path', help='inparanoid supercluster input file')
parser.add_argument('-m', '--rap_map_path', help='MSU-RAP mapping file')
parser.add_argument('-r', '--reciprocal_id', type=int, help='reciprocal identity percentage')
+parser.add_argument('-u', '--uniprot_substitution', help='substitute UniProt for reference loci')
# TODO: add an "inparanoid super-cluster vs. conventional input" flag
# output settings
-parser.add_argument('-u', '--uniprot_substitution', help='substitute UniProt for reference loci', action='store_true')
parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons')
parser.add_argument('-E', '--ensembl_output_path', help='output file containing flat (1-to-many) ensemble ortho pairs')
parser.add_argument('-I', '--inparanoid_output_path', help='output file containing flat (1-to-many) inparanoid ortho pairs')
# create ref loci::UniProt map, if specified
if args.uniprot_substitution :
- dict_uniprot_map = create_dict_uniprot_map()
+ dict_uniprot_map = create_dict_uniprot_map(args.uniprot_substitution)
# create projection maps
-dict_inp_map = create_inp_map(args.inparanoid_input_path, args.uniprot_substitution, dict_uniprot_map)
-dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, args.uniprot_substitution, dict_uniprot_map)
+dict_inp_map = create_inp_map(args.inparanoid_input_path, dict_uniprot_map)
+dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map)
# generate stats and output them
compare_maps(dict_ens_map, dict_inp_map, args.comparison_file_path, args.ensembl_output_path, args.inparanoid_output_path)