From 11f0b52e5dc8b11909500fb369d5af0fecfa54f7 Mon Sep 17 00:00:00 2001 From: preecej Date: Wed, 23 Mar 2016 01:28:28 +0000 Subject: [PATCH] now handles direct RAP::UniProt mapping for Ensembl data, not just RAP:MSU:UniProt. more inclusive svn path=/; revision=644 --- .../python_singletons/incomparanoid.py | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/Personnel/preecej/python_singletons/incomparanoid.py b/Personnel/preecej/python_singletons/incomparanoid.py index 75e4856..2d1ed3a 100755 --- a/Personnel/preecej/python_singletons/incomparanoid.py +++ b/Personnel/preecej/python_singletons/incomparanoid.py @@ -53,21 +53,23 @@ def create_dict_uniprot_map(uniprot_substitution_path) : def create_inp_map(inparanoid_input_path, dict_uniprot_map) : #---------------------------------------------------------------------------------------------------------------------- """ - open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci + open the inparanoid file (which is already loci-filtered for curated reference set and does not require RAP-to-MSU back-conversion) and generate a 2-col mapping of PRJ to LOC loci """ dict_inp_map = {} # local ensembl orthology dict INP = open(inparanoid_input_path) for line in INP : cols = line.rstrip().split() + # for now, this next step is suspended. we are accepting all transcripts: isoformic, multiple sequencing lane IDs, etc. # ignore non-canonical orthologs - if int(cols[1][-1]) > 1 : - continue + #if int(cols[1][-1]) > 1 : + # continue os_locus = cols[0].rstrip("1").rstrip(".") # swap loc for uniprot, if specified if dict_uniprot_map : os_locus = dict_uniprot_map[os_locus] - prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#') + #prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0].rsplit("-",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#', '-#') + prj_locus = cols[1] if os_locus in dict_inp_map : dict_inp_map[os_locus].add(prj_locus) else : @@ -123,18 +125,29 @@ def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_ for line in ENS : cols = line.rstrip().split() if len(cols) == 5 : - if cols[0] in dict_rap_map : - os_locus = dict_rap_map[cols[0]] - if os_locus in loci_filter : - # swap loc for uniprot, if specified - if dict_uniprot_map : - os_locus = dict_uniprot_map[os_locus] + if dict_uniprot_map : + # if this ens os is already in uniprot list + if cols[0] in dict_uniprot_map : + # get uniprot id and build into ens_map + os_locus = dict_uniprot_map[cols[0]] # reciprocal identity is >= recip_id%, optional high confidence if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident : if os_locus in dict_ens_map : dict_ens_map[os_locus].add(cols[1]) else : dict_ens_map[os_locus] = set([cols[1]]) + else : + if cols[0] in dict_rap_map : + os_locus = dict_rap_map[cols[0]] + if os_locus in loci_filter : + # swap loc for uniprot + os_locus = dict_uniprot_map[os_locus] + # reciprocal identity is >= recip_id%, optional high confidence + if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) >= is_confident : + if os_locus in dict_ens_map : + dict_ens_map[os_locus].add(cols[1]) + else : + dict_ens_map[os_locus] = set([cols[1]]) ENS.close() return dict_ens_map @@ -387,7 +400,7 @@ parser.add_argument('--proj_species', help='projection species') # TODO: add an "inparanoid super-cluster vs. conventional input" flag # output settings -parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projectied protein identifiers', default='') # e.g. 'MaizeGDB:' +parser.add_argument('-p', '--projection_prefix', help='add a platform-specific prefix to the projected protein identifiers', default='') # e.g. 'MaizeGDB:' parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons') parser.add_argument('-E', '--ensembl_output_path', help='output file containing flat (1-to-many) ensemble ortho pairs') parser.add_argument('-I', '--inparanoid_output_path', help='output file containing flat (1-to-many) inparanoid ortho pairs') @@ -407,10 +420,11 @@ if args.uniprot_substitution : # create projection maps if (args.inparanoid_input_path) : dict_inp_map = create_inp_map(args.inparanoid_input_path, dict_uniprot_map) -dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, 1 if args.confidence_high else 0) +if (args.ensembl_input_path) : + dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id, dict_uniprot_map, 1 if args.confidence_high else 0) -# generate stats and output them -if (args.inparanoid_input_path) : +# generate stats and output them; assumes both inparanoid and ensembl data have been provided +if (args.comparison_file_path) : all_venn_data = compare_maps(dict_ens_map, dict_inp_map, args.comparison_file_path, args.ensembl_output_path, args.inparanoid_output_path) if args.generate_reactome_output == 'ensembl' : -- 2.34.1