From 18e464d41939860c7e21c9339c41bd385f4a719d Mon Sep 17 00:00:00 2001 From: preecej Date: Sat, 22 Mar 2014 00:38:05 +0000 Subject: [PATCH] major revision completed: now accepts all input as file args and generates output files svn path=/; revision=559 --- .../python_singletons/incomparanoid.py | 229 +++++++++++------- 1 file changed, 136 insertions(+), 93 deletions(-) diff --git a/Personnel/preecej/python_singletons/incomparanoid.py b/Personnel/preecej/python_singletons/incomparanoid.py index 9a8a243..7bf3969 100755 --- a/Personnel/preecej/python_singletons/incomparanoid.py +++ b/Personnel/preecej/python_singletons/incomparanoid.py @@ -9,12 +9,14 @@ InComparaNoid: Script with different analysis methods used to compare Ensembl an import os import sys import argparse +import re #---------------------------------------------------------------------------------------------------------------------- # globals #---------------------------------------------------------------------------------------------------------------------- -ens_map = {} -inp_map = {} +list_stats = [] +dict_ens_map = {} +dict_inp_map = {} #---------------------------------------------------------------------------------------------------------------------- # classes @@ -44,89 +46,108 @@ class switch(object): #---------------------------------------------------------------------------------------------------------------------- #---------------------------------------------------------------------------------------------------------------------- -def create_ens_map(ortho_path, rap_map_path, recip_id) : +def create_inp_map(inparanoid_input_path) : #---------------------------------------------------------------------------------------------------------------------- """ - open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where - reciprocal identity is >= param% and confidence is high - """ - """ - count_loc = 0 - count_prj = 0 - dict_ens_ids = {} - ENS = open(path + input_file) - ENS.readline(); - for line in ENS : - cols = line.rstrip().split() - if len(cols) == 5 : - if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 : # reciprocal identity is >= recip_id%, high confidence - if cols[0] in dict_ens_ids : - dict_ens_ids[cols[0]].append(cols[1]) - else : - dict_ens_ids[cols[0]] = [cols[1]] - ENS.close() - - #for k, v in dict_ens_ids.iteritems() : - # print k, v - - #OS_2_PRJ_MAP = open(path + output_file,'w') - - RAP_IRGSP = open(path + "RAP-MSU.txt") - RAP_IRGSP.readline(); - for line in RAP_IRGSP: - if line.strip() != "" : - cols = line.rstrip().split() - rap_id = cols[0].upper() - if rap_id in dict_ens_ids : - print cols[1].split(",")[0] + "\t" + ",".join(dict_ens_ids[rap_id]) - #OS_2_PRJ_MAP.write(cols[1].upper() + "\t" + ",".join(set(dict_ens_ids[rap_id])) + "\n") - #count_prj += len(set(dict_ens_ids[rap_id])) - #count_loc += 1 - RAP_IRGSP.close() - #OS_2_PRJ_MAP.close() - - return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}] + open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci """ -#---------------------------------------------------------------------------------------------------------------------- -def create_inp_map(ortho_path) : -#---------------------------------------------------------------------------------------------------------------------- - """ - open the inparanoid file and generate a 2-col mapping of PRJ to LOC loci - """ - """ - count_loc = 0 - count_prj = 0 - dict_inp_ids = {} - INP = open(path + input_file) - INP.readline(); + dict_inp_map = {} # local ensembl orthology dict + count_loci = 0 + count_projections = 0 + + INP = open(inparanoid_input_path) for line in INP : cols = line.rstrip().split() # ignore non-canonical orthologs - if int(cols[1].rsplit(".",1)[1]) > 1 : + if int(cols[1][-1]) > 1 : continue - os_locus = cols[0].rstrip(".1") - prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes - if os_locus in dict_inp_ids : - dict_inp_ids[os_locus].append(prj_locus) + os_locus = cols[0].rstrip("1").rstrip(".") + prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#') + if os_locus in dict_inp_map : + dict_inp_map[os_locus].add(prj_locus) else : - dict_inp_ids[os_locus] = [prj_locus] + dict_inp_map[os_locus] = set([prj_locus]) INP.close() - OS_2_PRJ_MAP = open(path + output_file,'w') - for k, v in sorted(dict_inp_ids.iteritems()) : + for k, v in sorted(dict_inp_map.iteritems()) : #print k, v - OS_2_PRJ_MAP.write(k + "\t" + ",".join(set(v)) + "\n") - count_prj += len(set(v)) - count_loc += 1 - OS_2_PRJ_MAP.close() + count_projections += len(set(v)) + count_loci += 1 + + list_stats.extend([{"[Inparanoid Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}]) + + return dict_inp_map - return [{"Unique Os loci:": str(count_loc)}, {"Total projected loci:": str(count_prj)}] #---------------------------------------------------------------------------------------------------------------------- -def compare_maps(ensembl_map, inparanoid_map, output_path) : +def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_id) : #---------------------------------------------------------------------------------------------------------------------- - """compare Ensembl and Inparanoid projection results""" + """ + open the ensemble plants and rap::irgsp mapping files and generate a hash mapping of reference to projected loci where + reciprocal identity is >= recip_id% and confidence is high. also pre-filter against set of curated Plant Reactome + reference loci + """ + dict_ens_map = {} # local ensembl orthology dict + dict_rap_map = {} # local MSU-RAP dict, w/ only filtered canonical LOC loci (orig. ".1") + count_loci = 0 + count_projections = 0 + + # generate internal MSU-RAP map + RAP_MAP = open(rap_map_path) + for line in RAP_MAP: + if line.strip() != "" : + cols = line.rstrip().split() + rap_id = cols[0].upper() + + set_loc_ids = set(cols[1].upper().split(",")) + # select only the locus w/ a .1 suffix, if it exists + for loc_id in set_loc_ids : + curr_canon_locus = re.match('.*\.1', loc_id) + if curr_canon_locus : + dict_rap_map[rap_id] = curr_canon_locus.group(0).rstrip("1").rstrip(".") + break; + RAP_MAP.close() + + #for k, v in dict_rap_map.iteritems() : + # print k, v + + # generate ref loci filter + FILTER = open(filtering_loci_path) + loci_filter = set() + for line in FILTER : + loci_filter.add(line.rstrip()) + + #for locus in loci_filter : + # print locus + + ENS = open(ensembl_input_path) + for line in ENS : + cols = line.rstrip().split() + if len(cols) == 5 : + if cols[0] in dict_rap_map : + if dict_rap_map[cols[0]] in loci_filter : + # reciprocal identity is >= recip_id%, high confidence + if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 : + if dict_rap_map[cols[0]] in dict_ens_map : + dict_ens_map[dict_rap_map[cols[0]]].add(cols[1]) + else : + dict_ens_map[dict_rap_map[cols[0]]] = set([cols[1]]) + ENS.close() + + for k, v in dict_ens_map.iteritems() : + #print k, v + count_loci += 1 + count_projections += len(v) + + list_stats.extend([{"\n[Ensemble Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}]) + return dict_ens_map + + +#---------------------------------------------------------------------------------------------------------------------- +def compare_maps(dict_ens_map, dict_inp_map, comparison_file_path, ensembl_output_path, inparanoid_output_path) : +#---------------------------------------------------------------------------------------------------------------------- + """compare Ensembl and Inparanoid projection results""" inc_os = 0 # Common Os loci exc_ens_os = 0 # Exclusive Ensembl Os loci exc_inp_os = 0 # Exclusive Inparanoid Os loci @@ -139,16 +160,11 @@ def compare_maps(ensembl_map, inparanoid_map, output_path) : set_ens_os_loci = set() set_inp_os_loci = set() - """ - ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # common loci]} - """ + #ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # of common loci]} + # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level - ENS = open(path + input_file.split(',')[0]) - for line in ENS : - cols = line.rstrip().split() - os_locus = cols[0] - prj_loci = cols[1].split(',') + for os_locus, prj_loci in dict_ens_map.iteritems() : # build ref_dict if os_locus in ref_dict : @@ -158,13 +174,8 @@ def compare_maps(ensembl_map, inparanoid_map, output_path) : # build list of os ens loci for set operations set_ens_os_loci.add(os_locus) - ENS.close() - INP = open(path + input_file.split(',')[1]) - for line in INP : - cols = line.rstrip().split() - os_locus = cols[0] - prj_loci = cols[1].split(',') + for os_locus, prj_loci in dict_inp_map.iteritems() : # continue build of ref_dict if os_locus in ref_dict : @@ -174,7 +185,6 @@ def compare_maps(ensembl_map, inparanoid_map, output_path) : # build list of os inp loci for set operations set_inp_os_loci.add(os_locus) - INP.close() #print set_ens_os_loci & set_inp_os_loci @@ -197,13 +207,41 @@ def compare_maps(ensembl_map, inparanoid_map, output_path) : #for k, v in ref_dict.iteritems() : # print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2]) - return [{"Common Os loci:": str(inc_os)}, + list_stats.extend([{"\n[Comparison Stats]": ""}, + {"Common Os loci:": str(inc_os)}, {"Exclusive Ensembl Os loci:": str(exc_ens_os)}, {"Exclusive Inparanoid Os loci:": str(exc_inp_os)}, - {"Total common projected loci:": str(total_common_prj)}, - {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)}, - {"Avg. common projected loci:": str(avg_prj_overlap)}] - """ + {"Total Os loci:": str(inc_os + exc_ens_os + exc_inp_os)}, + {"Percentage Reference Loci Coverage (Ensemble):": str((float(inc_os + exc_ens_os)/float(inc_os + exc_ens_os + exc_inp_os)*100))}, + {"":""}, + {"Common projected loci:": str(total_common_prj)}, + {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)}, + {"Avg. common projected loci:": str(avg_prj_overlap)}]) + + COMP_STATS = open(comparison_file_path, 'w') + for item in list_stats : + for k, v in item.iteritems() : + COMP_STATS.write(k + " " + v + "\n") + COMP_STATS.close() + + INP_OUT_FILE = open(inparanoid_output_path,'w') + INP_FLAT_OUT_FILE = open(inparanoid_output_path + ".flat",'w') + for k, v in sorted(dict_inp_map.iteritems()) : + INP_OUT_FILE.write(k + "\t" + ",".join(v) + "\n") + for projection in sorted(v) : + INP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n") + INP_OUT_FILE.close() + INP_FLAT_OUT_FILE.close() + + ENS_OUT_FILE = open(ensembl_output_path,'w') + ENS_FLAT_OUT_FILE = open(ensembl_output_path + ".flat",'w') + for k, v in sorted(dict_ens_map.iteritems()) : + ENS_OUT_FILE.write(k + "\t" + ",".join(v) + "\n") + for projection in sorted(v) : + ENS_FLAT_OUT_FILE.write(k + "\t" + projection + "\n") + ENS_OUT_FILE.close() + ENS_FLAT_OUT_FILE.close() + #---------------------------------------------------------------------------------------------------------------------- # main #---------------------------------------------------------------------------------------------------------------------- @@ -211,21 +249,26 @@ def compare_maps(ensembl_map, inparanoid_map, output_path) : # process args parser = argparse.ArgumentParser(description='Script with different analysis methods used to compare Ensembl and Inparanoid species projections.') +# input settings +parser.add_argument('-f', '--filtering_loci_path', help='list of curated reference loci user for filtering') parser.add_argument('-e', '--ensembl_input_path', help='ensembl compara input file') parser.add_argument('-i', '--inparanoid_input_path', help='inparanoid supercluster input file') parser.add_argument('-m', '--rap_map_path', help='MSU-RAP mapping file') parser.add_argument('-r', '--reciprocal_id', type=int, help='reciprocal identity percentage') # TODO: add an "inparanoid super-cluster vs. conventional input" flag +# output settings parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons') +parser.add_argument('-E', '--ensembl_output_path', help='output file containing flat (1-to-many) ensemble ortho pairs') +parser.add_argument('-I', '--inparanoid_output_path', help='output file containing flat (1-to-many) inparanoid ortho pairs') args = parser.parse_args() -print args.accumulate(args.integers) +#print args # create projection maps -ens_map = create_ens_map(args.ensembl_input_path, args.rap_map_path, args.reciprocal_id) -inp_map = create_inp_map(args.inparanoid_input_path) +dict_inp_map = create_inp_map(args.inparanoid_input_path) +dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id) # generate stats and output them -compare_maps(ens_map, inp_map, comparison_file_path) +compare_maps(dict_ens_map, dict_inp_map, args.comparison_file_path, args.ensembl_output_path, args.inparanoid_output_path) -- 2.34.1