import os
import sys
import argparse
+import re
#----------------------------------------------------------------------------------------------------------------------
# globals
#----------------------------------------------------------------------------------------------------------------------
-ens_map = {}
-inp_map = {}
+list_stats = []
+dict_ens_map = {}
+dict_inp_map = {}
#----------------------------------------------------------------------------------------------------------------------
# classes
#----------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------------------------
-def create_ens_map(ortho_path, rap_map_path, recip_id) :
+def create_inp_map(inparanoid_input_path) :
#----------------------------------------------------------------------------------------------------------------------
"""
- open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where
- reciprocal identity is >= param% and confidence is high
- """
- """
- count_loc = 0
- count_prj = 0
- dict_ens_ids = {}
- ENS = open(path + input_file)
- ENS.readline();
- for line in ENS :
- cols = line.rstrip().split()
- if len(cols) == 5 :
- if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 : # reciprocal identity is >= recip_id%, high confidence
- if cols[0] in dict_ens_ids :
- dict_ens_ids[cols[0]].append(cols[1])
- else :
- dict_ens_ids[cols[0]] = [cols[1]]
- ENS.close()
-
- #for k, v in dict_ens_ids.iteritems() :
- # print k, v
-
- #OS_2_PRJ_MAP = open(path + output_file,'w')
-
- RAP_IRGSP = open(path + "RAP-MSU.txt")
- RAP_IRGSP.readline();
- for line in RAP_IRGSP:
- if line.strip() != "" :
- cols = line.rstrip().split()
- rap_id = cols[0].upper()
- if rap_id in dict_ens_ids :
- print cols[1].split(",")[0] + "\t" + ",".join(dict_ens_ids[rap_id])
- #OS_2_PRJ_MAP.write(cols[1].upper() + "\t" + ",".join(set(dict_ens_ids[rap_id])) + "\n")
- #count_prj += len(set(dict_ens_ids[rap_id]))
- #count_loc += 1
- RAP_IRGSP.close()
- #OS_2_PRJ_MAP.close()
-
- return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
+ open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci
"""
-#----------------------------------------------------------------------------------------------------------------------
-def create_inp_map(ortho_path) :
-#----------------------------------------------------------------------------------------------------------------------
- """
- open the inparanoid file and generate a 2-col mapping of PRJ to LOC loci
- """
- """
- count_loc = 0
- count_prj = 0
- dict_inp_ids = {}
- INP = open(path + input_file)
- INP.readline();
+ dict_inp_map = {} # local ensembl orthology dict
+ count_loci = 0
+ count_projections = 0
+
+ INP = open(inparanoid_input_path)
for line in INP :
cols = line.rstrip().split()
# ignore non-canonical orthologs
- if int(cols[1].rsplit(".",1)[1]) > 1 :
+ if int(cols[1][-1]) > 1 :
continue
- os_locus = cols[0].rstrip(".1")
- prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes
- if os_locus in dict_inp_ids :
- dict_inp_ids[os_locus].append(prj_locus)
+ os_locus = cols[0].rstrip("1").rstrip(".")
+ prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes (i.e. '.#', '_T0#')
+ if os_locus in dict_inp_map :
+ dict_inp_map[os_locus].add(prj_locus)
else :
- dict_inp_ids[os_locus] = [prj_locus]
+ dict_inp_map[os_locus] = set([prj_locus])
INP.close()
- OS_2_PRJ_MAP = open(path + output_file,'w')
- for k, v in sorted(dict_inp_ids.iteritems()) :
+ for k, v in sorted(dict_inp_map.iteritems()) :
#print k, v
- OS_2_PRJ_MAP.write(k + "\t" + ",".join(set(v)) + "\n")
- count_prj += len(set(v))
- count_loc += 1
- OS_2_PRJ_MAP.close()
+ count_projections += len(set(v))
+ count_loci += 1
+
+ list_stats.extend([{"[Inparanoid Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}])
+
+ return dict_inp_map
- return [{"Unique Os loci:": str(count_loc)}, {"Total projected loci:": str(count_prj)}]
#----------------------------------------------------------------------------------------------------------------------
-def compare_maps(ensembl_map, inparanoid_map, output_path) :
+def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_id) :
#----------------------------------------------------------------------------------------------------------------------
- """compare Ensembl and Inparanoid projection results"""
+ """
+ open the ensemble plants and rap::irgsp mapping files and generate a hash mapping of reference to projected loci where
+ reciprocal identity is >= recip_id% and confidence is high. also pre-filter against set of curated Plant Reactome
+ reference loci
+ """
+ dict_ens_map = {} # local ensembl orthology dict
+ dict_rap_map = {} # local MSU-RAP dict, w/ only filtered canonical LOC loci (orig. ".1")
+ count_loci = 0
+ count_projections = 0
+
+ # generate internal MSU-RAP map
+ RAP_MAP = open(rap_map_path)
+ for line in RAP_MAP:
+ if line.strip() != "" :
+ cols = line.rstrip().split()
+ rap_id = cols[0].upper()
+
+ set_loc_ids = set(cols[1].upper().split(","))
+ # select only the locus w/ a .1 suffix, if it exists
+ for loc_id in set_loc_ids :
+ curr_canon_locus = re.match('.*\.1', loc_id)
+ if curr_canon_locus :
+ dict_rap_map[rap_id] = curr_canon_locus.group(0).rstrip("1").rstrip(".")
+ break;
+ RAP_MAP.close()
+
+ #for k, v in dict_rap_map.iteritems() :
+ # print k, v
+
+ # generate ref loci filter
+ FILTER = open(filtering_loci_path)
+ loci_filter = set()
+ for line in FILTER :
+ loci_filter.add(line.rstrip())
+
+ #for locus in loci_filter :
+ # print locus
+
+ ENS = open(ensembl_input_path)
+ for line in ENS :
+ cols = line.rstrip().split()
+ if len(cols) == 5 :
+ if cols[0] in dict_rap_map :
+ if dict_rap_map[cols[0]] in loci_filter :
+ # reciprocal identity is >= recip_id%, high confidence
+ if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 :
+ if dict_rap_map[cols[0]] in dict_ens_map :
+ dict_ens_map[dict_rap_map[cols[0]]].add(cols[1])
+ else :
+ dict_ens_map[dict_rap_map[cols[0]]] = set([cols[1]])
+ ENS.close()
+
+ for k, v in dict_ens_map.iteritems() :
+ #print k, v
+ count_loci += 1
+ count_projections += len(v)
+
+ list_stats.extend([{"\n[Ensemble Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}])
+ return dict_ens_map
+
+
+#----------------------------------------------------------------------------------------------------------------------
+def compare_maps(dict_ens_map, dict_inp_map, comparison_file_path, ensembl_output_path, inparanoid_output_path) :
+#----------------------------------------------------------------------------------------------------------------------
+ """compare Ensembl and Inparanoid projection results"""
inc_os = 0 # Common Os loci
exc_ens_os = 0 # Exclusive Ensembl Os loci
exc_inp_os = 0 # Exclusive Inparanoid Os loci
set_ens_os_loci = set()
set_inp_os_loci = set()
- """
- ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # common loci]}
- """
+ #ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # of common loci]}
+
# iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level
- ENS = open(path + input_file.split(',')[0])
- for line in ENS :
- cols = line.rstrip().split()
- os_locus = cols[0]
- prj_loci = cols[1].split(',')
+ for os_locus, prj_loci in dict_ens_map.iteritems() :
# build ref_dict
if os_locus in ref_dict :
# build list of os ens loci for set operations
set_ens_os_loci.add(os_locus)
- ENS.close()
- INP = open(path + input_file.split(',')[1])
- for line in INP :
- cols = line.rstrip().split()
- os_locus = cols[0]
- prj_loci = cols[1].split(',')
+ for os_locus, prj_loci in dict_inp_map.iteritems() :
# continue build of ref_dict
if os_locus in ref_dict :
# build list of os inp loci for set operations
set_inp_os_loci.add(os_locus)
- INP.close()
#print set_ens_os_loci & set_inp_os_loci
#for k, v in ref_dict.iteritems() :
# print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2])
- return [{"Common Os loci:": str(inc_os)},
+ list_stats.extend([{"\n[Comparison Stats]": ""},
+ {"Common Os loci:": str(inc_os)},
{"Exclusive Ensembl Os loci:": str(exc_ens_os)},
{"Exclusive Inparanoid Os loci:": str(exc_inp_os)},
- {"Total common projected loci:": str(total_common_prj)},
- {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)},
- {"Avg. common projected loci:": str(avg_prj_overlap)}]
- """
+ {"Total Os loci:": str(inc_os + exc_ens_os + exc_inp_os)},
+ {"Percentage Reference Loci Coverage (Ensemble):": str((float(inc_os + exc_ens_os)/float(inc_os + exc_ens_os + exc_inp_os)*100))},
+ {"":""},
+ {"Common projected loci:": str(total_common_prj)},
+ {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)},
+ {"Avg. common projected loci:": str(avg_prj_overlap)}])
+
+ COMP_STATS = open(comparison_file_path, 'w')
+ for item in list_stats :
+ for k, v in item.iteritems() :
+ COMP_STATS.write(k + " " + v + "\n")
+ COMP_STATS.close()
+
+ INP_OUT_FILE = open(inparanoid_output_path,'w')
+ INP_FLAT_OUT_FILE = open(inparanoid_output_path + ".flat",'w')
+ for k, v in sorted(dict_inp_map.iteritems()) :
+ INP_OUT_FILE.write(k + "\t" + ",".join(v) + "\n")
+ for projection in sorted(v) :
+ INP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
+ INP_OUT_FILE.close()
+ INP_FLAT_OUT_FILE.close()
+
+ ENS_OUT_FILE = open(ensembl_output_path,'w')
+ ENS_FLAT_OUT_FILE = open(ensembl_output_path + ".flat",'w')
+ for k, v in sorted(dict_ens_map.iteritems()) :
+ ENS_OUT_FILE.write(k + "\t" + ",".join(v) + "\n")
+ for projection in sorted(v) :
+ ENS_FLAT_OUT_FILE.write(k + "\t" + projection + "\n")
+ ENS_OUT_FILE.close()
+ ENS_FLAT_OUT_FILE.close()
+
#----------------------------------------------------------------------------------------------------------------------
# main
#----------------------------------------------------------------------------------------------------------------------
# process args
parser = argparse.ArgumentParser(description='Script with different analysis methods used to compare Ensembl and Inparanoid species projections.')
+# input settings
+parser.add_argument('-f', '--filtering_loci_path', help='list of curated reference loci user for filtering')
parser.add_argument('-e', '--ensembl_input_path', help='ensembl compara input file')
parser.add_argument('-i', '--inparanoid_input_path', help='inparanoid supercluster input file')
parser.add_argument('-m', '--rap_map_path', help='MSU-RAP mapping file')
parser.add_argument('-r', '--reciprocal_id', type=int, help='reciprocal identity percentage')
# TODO: add an "inparanoid super-cluster vs. conventional input" flag
+# output settings
parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons')
+parser.add_argument('-E', '--ensembl_output_path', help='output file containing flat (1-to-many) ensemble ortho pairs')
+parser.add_argument('-I', '--inparanoid_output_path', help='output file containing flat (1-to-many) inparanoid ortho pairs')
args = parser.parse_args()
-print args.accumulate(args.integers)
+#print args
# create projection maps
-ens_map = create_ens_map(args.ensembl_input_path, args.rap_map_path, args.reciprocal_id)
-inp_map = create_inp_map(args.inparanoid_input_path)
+dict_inp_map = create_inp_map(args.inparanoid_input_path)
+dict_ens_map = create_ens_map(args.filtering_loci_path, args.ensembl_input_path, args.rap_map_path, args.reciprocal_id)
# generate stats and output them
-compare_maps(ens_map, inp_map, comparison_file_path)
+compare_maps(dict_ens_map, dict_inp_map, args.comparison_file_path, args.ensembl_output_path, args.inparanoid_output_path)