From: preecej Date: Fri, 21 Mar 2014 17:05:01 +0000 (+0000) Subject: rename program to make it more generic X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=ee9db74af90d9d0879f68dff527483e205a8b237;p=old-jaiswallab-svn%2F.git rename program to make it more generic svn path=/; revision=556 --- diff --git a/Personnel/preecej/python_singletons/incomparanoid.py b/Personnel/preecej/python_singletons/incomparanoid.py new file mode 100755 index 0000000..9a8a243 --- /dev/null +++ b/Personnel/preecej/python_singletons/incomparanoid.py @@ -0,0 +1,231 @@ +#!/bin/python +""" +InComparaNoid: Script with different analysis methods used to compare Ensembl and Inparanoid species projections. +""" + +#---------------------------------------------------------------------------------------------------------------------- +# modules +#---------------------------------------------------------------------------------------------------------------------- +import os +import sys +import argparse + +#---------------------------------------------------------------------------------------------------------------------- +# globals +#---------------------------------------------------------------------------------------------------------------------- +ens_map = {} +inp_map = {} + +#---------------------------------------------------------------------------------------------------------------------- +# classes +#---------------------------------------------------------------------------------------------------------------------- +class switch(object): + def __init__(self, value): + self.value = value + self.fall = False + + def __iter__(self): + """Return the match method once, then stop""" + yield self.match + raise StopIteration + + def match(self, *args): + """Indicate whether or not to enter a case suite""" + if self.fall or not args: + return True + elif self.value in args: + self.fall = True + return True + else: + return False + +#---------------------------------------------------------------------------------------------------------------------- +# functions +#---------------------------------------------------------------------------------------------------------------------- + +#---------------------------------------------------------------------------------------------------------------------- +def create_ens_map(ortho_path, rap_map_path, recip_id) : +#---------------------------------------------------------------------------------------------------------------------- + """ + open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where + reciprocal identity is >= param% and confidence is high + """ + """ + count_loc = 0 + count_prj = 0 + dict_ens_ids = {} + ENS = open(path + input_file) + ENS.readline(); + for line in ENS : + cols = line.rstrip().split() + if len(cols) == 5 : + if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 : # reciprocal identity is >= recip_id%, high confidence + if cols[0] in dict_ens_ids : + dict_ens_ids[cols[0]].append(cols[1]) + else : + dict_ens_ids[cols[0]] = [cols[1]] + ENS.close() + + #for k, v in dict_ens_ids.iteritems() : + # print k, v + + #OS_2_PRJ_MAP = open(path + output_file,'w') + + RAP_IRGSP = open(path + "RAP-MSU.txt") + RAP_IRGSP.readline(); + for line in RAP_IRGSP: + if line.strip() != "" : + cols = line.rstrip().split() + rap_id = cols[0].upper() + if rap_id in dict_ens_ids : + print cols[1].split(",")[0] + "\t" + ",".join(dict_ens_ids[rap_id]) + #OS_2_PRJ_MAP.write(cols[1].upper() + "\t" + ",".join(set(dict_ens_ids[rap_id])) + "\n") + #count_prj += len(set(dict_ens_ids[rap_id])) + #count_loc += 1 + RAP_IRGSP.close() + #OS_2_PRJ_MAP.close() + + return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}] + """ +#---------------------------------------------------------------------------------------------------------------------- +def create_inp_map(ortho_path) : +#---------------------------------------------------------------------------------------------------------------------- + """ + open the inparanoid file and generate a 2-col mapping of PRJ to LOC loci + """ + """ + count_loc = 0 + count_prj = 0 + dict_inp_ids = {} + INP = open(path + input_file) + INP.readline(); + for line in INP : + cols = line.rstrip().split() + # ignore non-canonical orthologs + if int(cols[1].rsplit(".",1)[1]) > 1 : + continue + os_locus = cols[0].rstrip(".1") + prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes + if os_locus in dict_inp_ids : + dict_inp_ids[os_locus].append(prj_locus) + else : + dict_inp_ids[os_locus] = [prj_locus] + INP.close() + + OS_2_PRJ_MAP = open(path + output_file,'w') + for k, v in sorted(dict_inp_ids.iteritems()) : + #print k, v + OS_2_PRJ_MAP.write(k + "\t" + ",".join(set(v)) + "\n") + count_prj += len(set(v)) + count_loc += 1 + OS_2_PRJ_MAP.close() + + return [{"Unique Os loci:": str(count_loc)}, {"Total projected loci:": str(count_prj)}] + +#---------------------------------------------------------------------------------------------------------------------- +def compare_maps(ensembl_map, inparanoid_map, output_path) : +#---------------------------------------------------------------------------------------------------------------------- + """compare Ensembl and Inparanoid projection results""" + + inc_os = 0 # Common Os loci + exc_ens_os = 0 # Exclusive Ensembl Os loci + exc_inp_os = 0 # Exclusive Inparanoid Os loci + total_common_prj = 0 # Total Common projected loci + inc_common_prj = 0 # Common projected loci where present in both Ens and Inp + avg_prj_overlap = 0.0 # Avg. common projected loci + avg_inc_prj_overlap = 0.0 # Avg. common projected loci present in both Ens and Inp + + ref_dict = {} + set_ens_os_loci = set() + set_inp_os_loci = set() + + """ + ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # common loci]} + """ + # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level + + ENS = open(path + input_file.split(',')[0]) + for line in ENS : + cols = line.rstrip().split() + os_locus = cols[0] + prj_loci = cols[1].split(',') + + # build ref_dict + if os_locus in ref_dict : + ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot + else : + ref_dict[os_locus] = [prj_loci, []] + + # build list of os ens loci for set operations + set_ens_os_loci.add(os_locus) + ENS.close() + + INP = open(path + input_file.split(',')[1]) + for line in INP : + cols = line.rstrip().split() + os_locus = cols[0] + prj_loci = cols[1].split(',') + + # continue build of ref_dict + if os_locus in ref_dict : + ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot + else : + ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot + + # build list of os inp loci for set operations + set_inp_os_loci.add(os_locus) + INP.close() + + #print set_ens_os_loci & set_inp_os_loci + + # calculate common projected loci overlap sets + for k, v in ref_dict.iteritems() : + curr_comm_prj_loci = len(set(v[0]) & set(v[1])) + v.append(curr_comm_prj_loci) + total_common_prj += curr_comm_prj_loci + if curr_comm_prj_loci > 0 : + inc_common_prj += curr_comm_prj_loci + + inc_os = len(set_ens_os_loci & set_inp_os_loci) + exc_ens_os = len(set_ens_os_loci - set_inp_os_loci) + exc_inp_os = len(set_inp_os_loci - set_ens_os_loci) + + # calculate avg. common projected loci overlaps + avg_prj_overlap = float(total_common_prj) / float(len(ref_dict)) + avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os) + + #for k, v in ref_dict.iteritems() : + # print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2]) + + return [{"Common Os loci:": str(inc_os)}, + {"Exclusive Ensembl Os loci:": str(exc_ens_os)}, + {"Exclusive Inparanoid Os loci:": str(exc_inp_os)}, + {"Total common projected loci:": str(total_common_prj)}, + {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)}, + {"Avg. common projected loci:": str(avg_prj_overlap)}] + """ +#---------------------------------------------------------------------------------------------------------------------- +# main +#---------------------------------------------------------------------------------------------------------------------- + +# process args +parser = argparse.ArgumentParser(description='Script with different analysis methods used to compare Ensembl and Inparanoid species projections.') + +parser.add_argument('-e', '--ensembl_input_path', help='ensembl compara input file') +parser.add_argument('-i', '--inparanoid_input_path', help='inparanoid supercluster input file') +parser.add_argument('-m', '--rap_map_path', help='MSU-RAP mapping file') +parser.add_argument('-r', '--reciprocal_id', type=int, help='reciprocal identity percentage') +# TODO: add an "inparanoid super-cluster vs. conventional input" flag + +parser.add_argument('-c', '--comparison_file_path', help='output file containing statistical comparisons') + +args = parser.parse_args() +print args.accumulate(args.integers) + +# create projection maps +ens_map = create_ens_map(args.ensembl_input_path, args.rap_map_path, args.reciprocal_id) +inp_map = create_inp_map(args.inparanoid_input_path) + +# generate stats and output them +compare_maps(ens_map, inp_map, comparison_file_path) + diff --git a/Personnel/preecej/python_singletons/map_os_2_at.py b/Personnel/preecej/python_singletons/map_os_2_at.py deleted file mode 100755 index be86c39..0000000 --- a/Personnel/preecej/python_singletons/map_os_2_at.py +++ /dev/null @@ -1,240 +0,0 @@ -#!/bin/python -""" -Script with different analysis methods used to compare Ensembl and Inparanoid species projections. -""" - -#---------------------------------------------------------------------------------------------------------------------- -# modules -#---------------------------------------------------------------------------------------------------------------------- -import os -import sys -import argparse - -#---------------------------------------------------------------------------------------------------------------------- -# classes -#---------------------------------------------------------------------------------------------------------------------- -class switch(object): - def __init__(self, value): - self.value = value - self.fall = False - - def __iter__(self): - """Return the match method once, then stop""" - yield self.match - raise StopIteration - - def match(self, *args): - """Indicate whether or not to enter a case suite""" - if self.fall or not args: - return True - elif self.value in args: - self.fall = True - return True - else: - return False - -#---------------------------------------------------------------------------------------------------------------------- -# functions -#---------------------------------------------------------------------------------------------------------------------- - -#---------------------------------------------------------------------------------------------------------------------- -def ens_map(input_file, output_file, recip_id) : -#---------------------------------------------------------------------------------------------------------------------- - """ - open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where - reciprocal identity is >= param% and confidence is high - """ - - count_loc = 0 - count_prj = 0 - dict_ens_ids = {} - ENS = open(path + input_file) - ENS.readline(); - for line in ENS : - cols = line.rstrip().split() - if len(cols) == 5 : - if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 : # reciprocal identity is >= recip_id%, high confidence - if cols[0] in dict_ens_ids : - dict_ens_ids[cols[0]].append(cols[1]) - else : - dict_ens_ids[cols[0]] = [cols[1]] - ENS.close() - - #for k, v in dict_ens_ids.iteritems() : - # print k, v - - #OS_2_PRJ_MAP = open(path + output_file,'w') - - RAP_IRGSP = open(path + "RAP-MSU.txt") - RAP_IRGSP.readline(); - for line in RAP_IRGSP: - if line.strip() != "" : - cols = line.rstrip().split() - rap_id = cols[0].upper() - if rap_id in dict_ens_ids : - print cols[1].split(",")[0] + "\t" + ",".join(dict_ens_ids[rap_id]) - #OS_2_PRJ_MAP.write(cols[1].upper() + "\t" + ",".join(set(dict_ens_ids[rap_id])) + "\n") - #count_prj += len(set(dict_ens_ids[rap_id])) - #count_loc += 1 - RAP_IRGSP.close() - #OS_2_PRJ_MAP.close() - - return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}] - -#---------------------------------------------------------------------------------------------------------------------- -def inp_map(input_file, output_file) : -#---------------------------------------------------------------------------------------------------------------------- - """ - open the inparanoid file and generate a 2-col mapping of PRJ to LOC loci - """ - - count_loc = 0 - count_prj = 0 - dict_inp_ids = {} - INP = open(path + input_file) - INP.readline(); - for line in INP : - cols = line.rstrip().split() - # ignore non-canonical orthologs - if int(cols[1].rsplit(".",1)[1]) > 1 : - continue - os_locus = cols[0].rstrip(".1") - prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes - if os_locus in dict_inp_ids : - dict_inp_ids[os_locus].append(prj_locus) - else : - dict_inp_ids[os_locus] = [prj_locus] - INP.close() - - OS_2_PRJ_MAP = open(path + output_file,'w') - for k, v in sorted(dict_inp_ids.iteritems()) : - #print k, v - OS_2_PRJ_MAP.write(k + "\t" + ",".join(set(v)) + "\n") - count_prj += len(set(v)) - count_loc += 1 - OS_2_PRJ_MAP.close() - - return [{"Unique Os loci:": str(count_loc)}, {"Total projected loci:": str(count_prj)}] - -#---------------------------------------------------------------------------------------------------------------------- -def compare_maps(input_file, output_file) : -#---------------------------------------------------------------------------------------------------------------------- - """compare Ensembl and Inparanoid projection results""" - - inc_os = 0 # Common Os loci - exc_ens_os = 0 # Exclusive Ensembl Os loci - exc_inp_os = 0 # Exclusive Inparanoid Os loci - total_common_prj = 0 # Total Common projected loci - inc_common_prj = 0 # Common projected loci where present in both Ens and Inp - avg_prj_overlap = 0.0 # Avg. common projected loci - avg_inc_prj_overlap = 0.0 # Avg. common projected loci present in both Ens and Inp - - ref_dict = {} - set_ens_os_loci = set() - set_inp_os_loci = set() - - """ - ref_dict structure: {os locus : [[ens projected locus, ...], [inp projected locus, ...], # common loci]} - """ - # iterate over both map files and build an overlap map, counting inclusions and exclusions at the reference and projection level - - ENS = open(path + input_file.split(',')[0]) - for line in ENS : - cols = line.rstrip().split() - os_locus = cols[0] - prj_loci = cols[1].split(',') - - # build ref_dict - if os_locus in ref_dict : - ref_dict[os_locus][0].extend(prj_loci) # add ens projected loci in the first list slot - else : - ref_dict[os_locus] = [prj_loci, []] - - # build list of os ens loci for set operations - set_ens_os_loci.add(os_locus) - ENS.close() - - INP = open(path + input_file.split(',')[1]) - for line in INP : - cols = line.rstrip().split() - os_locus = cols[0] - prj_loci = cols[1].split(',') - - # continue build of ref_dict - if os_locus in ref_dict : - ref_dict[os_locus][1].extend(prj_loci) # put the inp projected loci in the second list slot - else : - ref_dict[os_locus] = [[], prj_loci] # create an empty slot for the ens projected loci, put the inp projected loci in the second list slot - - # build list of os inp loci for set operations - set_inp_os_loci.add(os_locus) - INP.close() - - #print set_ens_os_loci & set_inp_os_loci - - # calculate common projected loci overlap sets - for k, v in ref_dict.iteritems() : - curr_comm_prj_loci = len(set(v[0]) & set(v[1])) - v.append(curr_comm_prj_loci) - total_common_prj += curr_comm_prj_loci - if curr_comm_prj_loci > 0 : - inc_common_prj += curr_comm_prj_loci - - inc_os = len(set_ens_os_loci & set_inp_os_loci) - exc_ens_os = len(set_ens_os_loci - set_inp_os_loci) - exc_inp_os = len(set_inp_os_loci - set_ens_os_loci) - - # calculate avg. common projected loci overlaps - avg_prj_overlap = float(total_common_prj) / float(len(ref_dict)) - avg_inc_prj_overlap = float(inc_common_prj) / float(inc_os) - - #for k, v in ref_dict.iteritems() : - # print k + "\n Ens: " + str(v[0]) + "\n Inp: " + str(v[1]) + "\n Common: " + str(v[2]) - - return [{"Common Os loci:": str(inc_os)}, - {"Exclusive Ensembl Os loci:": str(exc_ens_os)}, - {"Exclusive Inparanoid Os loci:": str(exc_inp_os)}, - {"Total common projected loci:": str(total_common_prj)}, - {"Avg. inclusive common projected loci:": str(avg_inc_prj_overlap)}, - {"Avg. common projected loci:": str(avg_prj_overlap)}] - -#---------------------------------------------------------------------------------------------------------------------- -# main -#---------------------------------------------------------------------------------------------------------------------- - -# process args -parser = argparse.ArgumentParser(description='Script with different analysis methods used to compare Ensembl and Inparanoid species projections.') - -parser.add_argument('-r', '--reciprocal_id', type=int, help='reciprocal identity percentage') -parser.add_argument('-e', '--ensembl_input', help='ensmbl input file') -parser.add_argument('-i', '--inparanoid_input', help='inparanoid input file') -parser.add_argument('-o', '--output', help='destination output file') - -args = parser.parse_args() -print args.accumulate(args.integers) - -input_file = -output_file = -map_source_type = -recip_id = sys.argv[4] - - -list_counts = [] - -for case in switch(map_source_type): - if case("ens"): - list_counts = ens_map(input_file, output_file, recip_id) - break - if case("inp"): - list_counts = inp_map(input_file, output_file) - break - if case("compare"): - list_counts = compare_maps(input_file, output_file) # input_file is actually two comma-del input files (ens 1st, inp 2nd) - break - if case(): # default, could also just omit condition or 'if True' - print "Must provide a valid map source type: 'ens' (Ensembl), 'inp' (Inparanoid), or 'compare'" - -for item in list_counts : - for k, v in item.iteritems(): - print k, v -