From a0108266815084dd6dc0d70430e9b103a8a3ddf3 Mon Sep 17 00:00:00 2001 From: preecej Date: Sat, 8 Mar 2014 01:07:05 +0000 Subject: [PATCH] major development complete; now testing w/ different projected species svn path=/; revision=534 --- .../preecej/python_singletons/map_os_2_at.py | 255 +++++++++++++++--- 1 file changed, 222 insertions(+), 33 deletions(-) diff --git a/Personnel/preecej/python_singletons/map_os_2_at.py b/Personnel/preecej/python_singletons/map_os_2_at.py index 7a27776..b784e8e 100755 --- a/Personnel/preecej/python_singletons/map_os_2_at.py +++ b/Personnel/preecej/python_singletons/map_os_2_at.py @@ -1,37 +1,226 @@ #!/bin/python +""" +Script with different analysis methods used to compare Ensembl and Inparanoid species projections. +""" -# open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of AT to LOC loci where reciprocal identity is >= 50% and confidence is high - -path = "/home/preecej/Documents/projects/plant_reactome/plant_reactome_site/projection/rice_to_arabidopsis/" - -dict_ens_ids = {} - -ENS = open(path + "ensembl_plants_40_os_2_at_uniq.tab") -ENS.readline(); -for line in ENS : - cols = line.rstrip().split() - if len(cols) == 5 : - if int(cols[2]) >= 50 and int(cols[3]) >= 50 and int(cols[4]) == 1 : # reciprocal identity is >= 50%, high confidence - if cols[0] in dict_ens_ids.keys() : - dict_ens_ids[cols[0]].append(cols[1]) - else : - dict_ens_ids[cols[0]] = [cols[1]] -ENS.close() - -#for k, v in dict_ens_ids.iteritems() : -# print k, v - -OS_2_AT_MAP = open(path + "ensembl_ortho_os_2_at.tab",'w') - -RAP_IRGSP = open(path + "loc_rap_mappings.txt") -RAP_IRGSP.readline(); -for line in RAP_IRGSP: - if line.strip() != "" : +#---------------------------------------------------------------------------------------------------------------------- +# modules +#---------------------------------------------------------------------------------------------------------------------- +import os +import sys + +#---------------------------------------------------------------------------------------------------------------------- +# globals +#---------------------------------------------------------------------------------------------------------------------- +path = os.getcwd() + "/" +input_file = sys.argv[1] +output_file = sys.argv[2] +map_source_type = sys.argv[3] + +#---------------------------------------------------------------------------------------------------------------------- +# classes +#---------------------------------------------------------------------------------------------------------------------- +class switch(object): + def __init__(self, value): + self.value = value + self.fall = False + + def __iter__(self): + """Return the match method once, then stop""" + yield self.match + raise StopIteration + + def match(self, *args): + """Indicate whether or not to enter a case suite""" + if self.fall or not args: + return True + elif self.value in args: + self.fall = True + return True + else: + return False + +#---------------------------------------------------------------------------------------------------------------------- +# functions +#---------------------------------------------------------------------------------------------------------------------- + +#---------------------------------------------------------------------------------------------------------------------- +def ens_map(input_file, output_file) : +#---------------------------------------------------------------------------------------------------------------------- + """ + open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where + reciprocal identity is >= 50% and confidence is high + """ + + count_loc = 0 + count_prj = 0 + dict_ens_ids = {} + ENS = open(path + input_file) + ENS.readline(); + for line in ENS : + cols = line.rstrip().split() + if len(cols) == 5 : + if int(cols[2]) >= 50 and int(cols[3]) >= 50 and int(cols[4]) == 1 : # reciprocal identity is >= 50%, high confidence + if cols[0] in dict_ens_ids.keys() : + dict_ens_ids[cols[0]].append(cols[1]) + else : + dict_ens_ids[cols[0]] = [cols[1]] + ENS.close() + + #for k, v in dict_ens_ids.iteritems() : + # print k, v + + OS_2_PRJ_MAP = open(path + output_file,'w') + + RAP_IRGSP = open(path + "loc_rap_mappings_uniq.txt") + RAP_IRGSP.readline(); + for line in RAP_IRGSP: + if line.strip() != "" : + cols = line.rstrip().split() + rap_id = cols[1].upper() + if rap_id in dict_ens_ids and rap_id != "NONE" : + #print cols[0] + "\t" + ",".join(dict_ens_ids[rap_id]) + OS_2_PRJ_MAP.write(cols[0].upper() + "\t" + ",".join(dict_ens_ids[rap_id]) + "\n") + count_prj += len(dict_ens_ids[rap_id]) + count_loc += 1 + RAP_IRGSP.close() + OS_2_PRJ_MAP.close() + + return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}] + +#---------------------------------------------------------------------------------------------------------------------- +def inp_map(input_file, output_file) : +#---------------------------------------------------------------------------------------------------------------------- + """ + open the inparanoid file and generate a 2-col mapping of PRJ to LOC loci + """ + + count_loc = 0 + count_prj = 0 + dict_inp_ids = {} + INP = open(path + input_file) + INP.readline(); + for line in INP : + cols = line.rstrip().split() + os_locus = cols[1].rstrip(".1") + prj_locus = cols[2].rstrip(".1") + if os_locus in dict_inp_ids.keys() : + dict_inp_ids[os_locus].append(prj_locus) + else : + dict_inp_ids[os_locus] = [prj_locus] + INP.close() + + OS_2_PRJ_MAP = open(path + output_file,'w') + for k, v in sorted(dict_inp_ids.iteritems()) : + #print k, v + OS_2_PRJ_MAP.write(k + "\t" + ",".join(v) + "\n") + count_prj += len(v) + count_loc += 1 + OS_2_PRJ_MAP.close() + + return [{"Unique Os loci:": str(count_loc)}, {"Total projected loci:": str(count_prj)}] + +#---------------------------------------------------------------------------------------------------------------------- +def compare_maps(input_file, output_file) : +#---------------------------------------------------------------------------------------------------------------------- + """compare Ensembl and Inparanoid projection results""" + + inc_os = 0 # Common Os loci + exc_ens_os = 0 # Exclusive Ensembl Os loci + exc_inp_os = 0 # Exclusive Inparanoid Os loci + total_common_prj = 0 # Total Common projected loci + inc_common_prj = 0 # Common projected loci where present in both Ens and Inp + avg_prj_overlap = 0.0 # Avg. common projected loci + avg_inc_prj_overlap = 0.0 # Avg. common projected loci present in both Ens and Inp + + ref_dict = {} + set_ens_os_loci = set() + set_inp_os_loci = set() + + """ + ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # common loci]} + """ + # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level + + ENS = open(path + input_file.split(',')[0]) + for line in ENS : cols = line.rstrip().split() - rap_id = cols[1].upper() - if rap_id in dict_ens_ids and rap_id != "NONE" : - #print cols[0] + "\t" + ",".join(dict_ens_ids[rap_id]) - OS_2_AT_MAP.write(cols[0] + "\t" + ",".join(dict_ens_ids[rap_id]) + "\n") -RAP_IRGSP.close() + os_locus = cols[0] + prj_loci = cols[1].split(',') + + # build ref_dict + if os_locus in ref_dict.keys() : + ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot + else : + ref_dict[os_locus] = [prj_loci, []] + + # build list of os ens loci for set operations + set_ens_os_loci.add(os_locus) + ENS.close() + + INP = open(path + input_file.split(',')[1]) + for line in INP : + cols = line.rstrip().split() + os_locus = cols[0] + prj_loci = cols[1].split(',') + + # continue build of ref_dict + if os_locus in ref_dict.keys() : + ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot + else : + ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot + + # build list of os inp loci for set operations + set_inp_os_loci.add(os_locus) + INP.close() + + #print set_ens_os_loci & set_inp_os_loci + + # calculate common projected loci overlap sets + for k, v in ref_dict.iteritems() : + curr_comm_prj_loci = len(set(v[0]) & set(v[1])) + v.append(curr_comm_prj_loci) + total_common_prj += curr_comm_prj_loci + if curr_comm_prj_loci > 0 : + inc_common_prj += curr_comm_prj_loci + + inc_os = len(set_ens_os_loci & set_inp_os_loci) + exc_ens_os = len(set_ens_os_loci - set_inp_os_loci) + exc_inp_os = len(set_inp_os_loci - set_ens_os_loci) + + # calculate avg. common projected loci overlaps + avg_prj_overlap = float(total_common_prj) / float(len(ref_dict)) + avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os) + + #for k, v in ref_dict.iteritems() : + # print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2]) + + return [{"Common Os loci:": str(inc_os)}, + {"Exclusive Ensembl Os loci:": str(exc_ens_os)}, + {"Exclusive Inparanoid Os loci:": str(exc_inp_os)}, + {"Total common projected loci:": str(total_common_prj)}, + {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)}, + {"Avg. common projected loci:": str(avg_prj_overlap)}] + +#---------------------------------------------------------------------------------------------------------------------- +# main +#---------------------------------------------------------------------------------------------------------------------- +list_counts = [] + +for case in switch(map_source_type): + if case("ens"): + list_counts = ens_map(input_file, output_file) + break + if case("inp"): + list_counts = inp_map(input_file, output_file) + break + if case("compare"): + list_counts = compare_maps(input_file, output_file) # input_file is actually two comma-del input files (ens 1st, inp 2nd) + break + if case(): # default, could also just omit condition or 'if True' + print "Must provide a valid map source type: 'ens' (Ensembl), 'inp' (Inparanoid), or 'compare'" + +for item in list_counts : + for k, v in item.iteritems(): + print k, v -OS_2_AT_MAP.close() -- 2.34.1