From: preecej Date: Sat, 5 Apr 2014 01:41:47 +0000 (+0000) Subject: complete refactor of projection comparison code; added more projection X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=78aace236712df95005368765b7c89714d34760e;p=old-jaiswallab-svn%2F.git complete refactor of projection comparison code; added more projection stats svn path=/; revision=569 --- diff --git a/Personnel/preecej/python_singletons/incomparanoid.py b/Personnel/preecej/python_singletons/incomparanoid.py index 7841bd3..82d620d 100755 --- a/Personnel/preecej/python_singletons/incomparanoid.py +++ b/Personnel/preecej/python_singletons/incomparanoid.py @@ -19,29 +19,6 @@ dict_uniprot_map = {} dict_ens_map = {} dict_inp_map = {} -#---------------------------------------------------------------------------------------------------------------------- -# classes -#---------------------------------------------------------------------------------------------------------------------- -class switch(object): - def __init__(self, value): - self.value = value - self.fall = False - - def __iter__(self): - """Return the match method once, then stop""" - yield self.match - raise StopIteration - - def match(self, *args): - """Indicate whether or not to enter a case suite""" - if self.fall or not args: - return True - elif self.value in args: - self.fall = True - return True - else: - return False - #---------------------------------------------------------------------------------------------------------------------- # functions #---------------------------------------------------------------------------------------------------------------------- @@ -65,11 +42,9 @@ def create_dict_uniprot_map(uniprot_substitution_path) : dict_uniprot_map[loc] = uniprot UNI.close() - #for k, v in sorted(dict_uniprot_map.iteritems()) : - #print k, v - return dict_uniprot_map + #---------------------------------------------------------------------------------------------------------------------- def create_inp_map(inparanoid_input_path, dict_uniprot_map) : #---------------------------------------------------------------------------------------------------------------------- @@ -77,8 +52,6 @@ def create_inp_map(inparanoid_input_path, dict_uniprot_map) : open the inparanoid file (which is already loci-filtered for curated reference set) and generate a 2-col mapping of PRJ to LOC loci """ dict_inp_map = {} # local ensembl orthology dict - count_loci = 0 - count_projections = 0 INP = open(inparanoid_input_path) for line in INP : @@ -96,13 +69,6 @@ def create_inp_map(inparanoid_input_path, dict_uniprot_map) : else : dict_inp_map[os_locus] = set([prj_locus]) INP.close() - - for k, v in sorted(dict_inp_map.iteritems()) : - #print k, v - count_projections += len(set(v)) - count_loci += 1 - - list_stats.extend([{"[Inparanoid Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}]) return dict_inp_map @@ -117,8 +83,6 @@ def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_ """ dict_ens_map = {} # local ensembl orthology dict dict_rap_map = {} # local MSU-RAP dict, w/ only filtered canonical LOC loci (orig. ".1") - count_loci = 0 - count_projections = 0 # generate internal MSU-RAP map RAP_MAP = open(rap_map_path) @@ -136,9 +100,6 @@ def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_ break; RAP_MAP.close() - #for k, v in dict_rap_map.iteritems() : - # print k, v - # generate ref loci filter FILTER = open(filtering_loci_path) loci_filter = set() @@ -165,114 +126,136 @@ def create_ens_map(filtering_loci_path, ensembl_input_path, rap_map_path, recip_ else : dict_ens_map[os_locus] = set([cols[1]]) ENS.close() - - for k, v in dict_ens_map.iteritems() : - #print k, v - count_loci += 1 - count_projections += len(v) - - list_stats.extend([{"\n[Ensemble Stats]": ""}, {"Unique Os loci: ": str(count_loci)}, {"Total projected loci:": str(count_projections)}]) return dict_ens_map #---------------------------------------------------------------------------------------------------------------------- -def compare_maps(dict_ens_map, dict_inp_map, comparison_file_path, ensembl_output_path, inparanoid_output_path) : +def compare_maps(dict_cmp_map, dict_inp_map, comparison_file_path, compara_output_path, inparanoid_output_path) : #---------------------------------------------------------------------------------------------------------------------- """compare Ensembl and Inparanoid projection results""" - inc_os = 0 # Common Os loci - exc_ens_os = 0 # Exclusive Ensembl Os loci - exc_inp_os = 0 # Exclusive Inparanoid Os loci - total_common_prj = 0 # Total Common projected loci - inc_common_prj = 0 # Common projected loci where present in both Ens and Inp - avg_prj_overlap = 0.0 # Avg. common projected loci - avg_inc_prj_overlap = 0.0 # Avg. common projected loci present in both Ens and Inp + + # reference loci + inp_ref_loci = len(dict_inp_map.keys()) # inparanoid ref loci + cmp_ref_loci = len(dict_cmp_map.keys()) # compara ref loci - ref_dict = {} - set_ens_os_loci = set() - set_inp_os_loci = set() + set_inp_ref_loci = set(dict_inp_map.keys()) + set_cmp_ref_loci = set(dict_cmp_map.keys()) - #ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # of common loci]} + inp_exc_ref_loci = len(set_inp_ref_loci - set_cmp_ref_loci) # inparanoid exclusive ref loci + cmp_exc_ref_loci = len(set_cmp_ref_loci - set_inp_ref_loci) # compara exclusive ref loci + union_ref_loci = len(set_inp_ref_loci | set_cmp_ref_loci) # all ref loci + intersection_ref_loci = len(set_inp_ref_loci & set_cmp_ref_loci) # all shared ref loci - # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level + inp_ref_loci_coverage = '{:.2%}'.format(inp_ref_loci/float(union_ref_loci)) + cmp_ref_loci_coverage = '{:.2%}'.format(cmp_ref_loci/float(union_ref_loci)) - for os_locus, prj_loci in dict_ens_map.iteritems() : + """ + projected loci (all non-unique, meaning recounted relative to each ref locus; in other words, + not a collapsed set of projections that would ignore 1-to-many ref loci projections + """ + inp_prj_loci = sum([len(v) for v in dict_inp_map.itervalues()]) # total inparanoid projected loci + cmp_prj_loci = sum([len(v) for v in dict_cmp_map.itervalues()]) # total compara projected loci - # build ref_dict - if os_locus in ref_dict : - ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot + # iterate over both map files and build an overlap map, for the purpose of counting inclusive and exclusive projections by locus + # ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # of common loci]} + ref_dict = {} + + # build ref_dict (Compara) + for ref_locus, prj_loci in dict_cmp_map.iteritems() : + if ref_locus in ref_dict : + ref_dict[ref_locus][0].extend(prj_loci) # add Compara projected loci in the first list slot else : - ref_dict[os_locus] = [prj_loci, []] - - # build list of os ens loci for set operations - set_ens_os_loci.add(os_locus) + ref_dict[ref_locus] = [prj_loci, []] - for os_locus, prj_loci in dict_inp_map.iteritems() : - - # continue build of ref_dict - if os_locus in ref_dict : - ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot + # continue build of ref_dict (Inparanoid) + for ref_locus, prj_loci in dict_inp_map.iteritems() : + if ref_locus in ref_dict : + ref_dict[ref_locus][1].extend(prj_loci) # put the Inparanoid projected loci in the second list slot else : - ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot - - # build list of os inp loci for set operations - set_inp_os_loci.add(os_locus) + ref_dict[ref_locus] = [[], prj_loci] # create an empty slot for the Compara projected loci, put the Inparanoid projected loci in the second list slot - #print set_ens_os_loci & set_inp_os_loci + inp_exc_prj_loci = 0 # inparanoid exclusive projected loci + cmp_exc_prj_loci = 0 # compara exclusive projected loci + union_prj_loci = 0 # all projections, duplicates removed only at the ref locus level + intersection_prj_loci = 0 # all overlapping projections # calculate common projected loci overlap sets for k, v in ref_dict.iteritems() : - curr_comm_prj_loci = len(set(v[0]) & set(v[1])) - v.append(curr_comm_prj_loci) - total_common_prj += curr_comm_prj_loci - if curr_comm_prj_loci > 0 : - inc_common_prj += curr_comm_prj_loci + curr_intersection_prj_loci = len(set(v[0]) & set(v[1])) + v.append(curr_intersection_prj_loci) + intersection_prj_loci += curr_intersection_prj_loci + + union_prj_loci += len(set(v[0]) | set(v[1])) + inp_exc_prj_loci += len(set(v[1]) - set(v[0])) + cmp_exc_prj_loci += len(set(v[0]) - set(v[1])) - inc_os = len(set_ens_os_loci & set_inp_os_loci) - exc_ens_os = len(set_ens_os_loci - set_inp_os_loci) - exc_inp_os = len(set_inp_os_loci - set_ens_os_loci) + inp_prj_loci_coverage = '{:.2%}'.format(inp_prj_loci/float(union_prj_loci)) + cmp_prj_loci_coverage = '{:.2%}'.format(cmp_prj_loci/float(union_prj_loci)) + + # other stats + #avg_projections_per_locus + #avg_overlapping_projections_per_locus # calculate avg. common projected loci overlaps - avg_prj_overlap = float(total_common_prj) / float(len(ref_dict)) - avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os) - - #for k, v in ref_dict.iteritems() : - # print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2]) - - list_stats.extend([{"\n[Comparison Stats]": ""}, - {"Common Os loci:": str(inc_os)}, - {"Exclusive Ensembl Os loci:": str(exc_ens_os)}, - {"Exclusive Inparanoid Os loci:": str(exc_inp_os)}, - {"Total Os loci:": str(inc_os + exc_ens_os + exc_inp_os)}, - {"Percentage Reference Loci Coverage (Ensemble):": str((float(inc_os + exc_ens_os)/float(inc_os + exc_ens_os + exc_inp_os)*100))}, - {"":""}, - {"Common projected loci:": str(total_common_prj)}, - {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)}, - {"Avg. common projected loci:": str(avg_prj_overlap)}]) - - COMP_STATS = open(comparison_file_path, 'w') + #avg_prj_overlap = float(total_intersection_projections) / float(len(ref_dict)) + #avg_inc_prj_overlap = float(total_union_projections) / float(inc_os) + + list_stats.extend([{"[Comparison Stats]": ""}, + + {"\n-- Reference Loci --":""}, + {"Inparanoid reference loci:": str(inp_ref_loci)}, + {"Compara reference loci:": str(cmp_ref_loci)}, + {"Exclusive Inparanoid reference loci:": str(inp_exc_ref_loci)}, + {"Exclusive Compara reference loci:": str(cmp_exc_ref_loci)}, + {"All reference loci (union):": str(union_ref_loci)}, + {"All shared reference loci (intersection):": str(intersection_ref_loci)}, + {"Inparanoid reference loci coverage:": inp_ref_loci_coverage}, + {"Compara reference loci coverage:": cmp_ref_loci_coverage}, + + {"\n-- Projected Loci (all non-unique with respect to reference loci) --":""}, + {"Inparanoid projected loci:": str(inp_prj_loci)}, + {"Compara projected loci:": str(cmp_prj_loci)}, + {"Inparanoid exclusive projected loci:": str(inp_exc_prj_loci)}, + {"Compara exclusive projected loci:": str(cmp_exc_prj_loci)}, + {"All projected loci (union):": str(union_prj_loci)}, + {"All shared projected loci (intersection):": str(intersection_prj_loci)}, + {"Inparanoid projected loci coverage:": inp_prj_loci_coverage}, + {"Compara projected loci coverage:": cmp_prj_loci_coverage}, + + ]) + for item in list_stats : for k, v in item.iteritems() : - COMP_STATS.write(k + " " + v + "\n") - COMP_STATS.close() - - INP_OUT_FILE = open(inparanoid_output_path,'w') - INP_FLAT_OUT_FILE = open(inparanoid_output_path + ".flat",'w') - for k, v in sorted(dict_inp_map.iteritems()) : - INP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n") - for projection in sorted(v) : - INP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n") - INP_OUT_FILE.close() - INP_FLAT_OUT_FILE.close() - - ENS_OUT_FILE = open(ensembl_output_path,'w') - ENS_FLAT_OUT_FILE = open(ensembl_output_path + ".flat",'w') - for k, v in sorted(dict_ens_map.iteritems()) : - ENS_OUT_FILE.write(k + "\t" + " ".join(v) + "\n") - for projection in sorted(v) : - ENS_FLAT_OUT_FILE.write(k + "\t" + projection + "\n") - ENS_OUT_FILE.close() - ENS_FLAT_OUT_FILE.close() + print(k + " " + v) + + if comparison_file_path : + COMP_STATS = open(comparison_file_path, 'w') + for item in list_stats : + for k, v in item.iteritems() : + COMP_STATS.write(k + " " + v + "\n") + COMP_STATS.close() + + if inparanoid_output_path : + INP_OUT_FILE = open(inparanoid_output_path,'w') + INP_FLAT_OUT_FILE = open(inparanoid_output_path + ".flat",'w') + for k, v in sorted(dict_inp_map.iteritems()) : + INP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n") + for projection in sorted(v) : + INP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n") + INP_OUT_FILE.close() + INP_FLAT_OUT_FILE.close() + + if compara_output_path : + CMP_OUT_FILE = open(compara_output_path,'w') + CMP_FLAT_OUT_FILE = open(compara_output_path + ".flat",'w') + for k, v in sorted(dict_cmp_map.iteritems()) : + CMP_OUT_FILE.write(k + "\t" + " ".join(v) + "\n") + for projection in sorted(v) : + CMP_FLAT_OUT_FILE.write(k + "\t" + projection + "\n") + CMP_OUT_FILE.close() + CMP_FLAT_OUT_FILE.close() + #---------------------------------------------------------------------------------------------------------------------- def write_reactome_files(dict_map, reactome_gene_protein_path, reactome_projection_path, projection_prefix) :