From: preecej Date: Fri, 21 Mar 2014 00:15:02 +0000 (+0000) Subject: WIP, not building at the moment. refactoring to accept ens, inp super X-Git-Url: http://gitweb.planteome.org/?a=commitdiff_plain;h=2b558b54e7c9a043e8f8647d71e9d93ccaa8acf7;p=old-jaiswallab-svn%2F.git WIP, not building at the moment. refactoring to accept ens, inp super cluster, and rap map files all at once, run in one shot, produce mapping and overlap stats files. future work: also produce reactome orthopair files and venne diagram. maybe build UI, too, just to learn how. svn path=/; revision=554 --- diff --git a/Personnel/preecej/python_singletons/map_os_2_at.py b/Personnel/preecej/python_singletons/map_os_2_at.py index b7deced..be86c39 100755 --- a/Personnel/preecej/python_singletons/map_os_2_at.py +++ b/Personnel/preecej/python_singletons/map_os_2_at.py @@ -8,14 +8,7 @@ Script with different analysis methods used to compare Ensembl and Inparanoid sp #---------------------------------------------------------------------------------------------------------------------- import os import sys - -#---------------------------------------------------------------------------------------------------------------------- -# globals -#---------------------------------------------------------------------------------------------------------------------- -path = os.getcwd() + "/" -input_file = sys.argv[1] -output_file = sys.argv[2] -map_source_type = sys.argv[3] +import argparse #---------------------------------------------------------------------------------------------------------------------- # classes @@ -45,11 +38,11 @@ class switch(object): #---------------------------------------------------------------------------------------------------------------------- #---------------------------------------------------------------------------------------------------------------------- -def ens_map(input_file, output_file) : +def ens_map(input_file, output_file, recip_id) : #---------------------------------------------------------------------------------------------------------------------- """ open the ensemble plants and rap::irgsp mapping files and generate a 2-col mapping of PRJ to LOC loci where - reciprocal identity is >= 50% and confidence is high + reciprocal identity is >= param% and confidence is high """ count_loc = 0 @@ -60,7 +53,7 @@ def ens_map(input_file, output_file) : for line in ENS : cols = line.rstrip().split() if len(cols) == 5 : - if int(cols[2]) >= 50 and int(cols[3]) >= 50 and int(cols[4]) == 1 : # reciprocal identity is >= 50%, high confidence + if int(cols[2]) >= recip_id and int(cols[3]) >= recip_id and int(cols[4]) == 1 : # reciprocal identity is >= recip_id%, high confidence if cols[0] in dict_ens_ids : dict_ens_ids[cols[0]].append(cols[1]) else : @@ -70,21 +63,21 @@ def ens_map(input_file, output_file) : #for k, v in dict_ens_ids.iteritems() : # print k, v - OS_2_PRJ_MAP = open(path + output_file,'w') + #OS_2_PRJ_MAP = open(path + output_file,'w') - RAP_IRGSP = open(path + "loc_rap_mappings_uniq.txt") + RAP_IRGSP = open(path + "RAP-MSU.txt") RAP_IRGSP.readline(); for line in RAP_IRGSP: if line.strip() != "" : cols = line.rstrip().split() - rap_id = cols[1].upper() - if rap_id in dict_ens_ids and rap_id != "NONE" : - #print cols[0] + "\t" + ",".join(dict_ens_ids[rap_id]) - OS_2_PRJ_MAP.write(cols[0].upper() + "\t" + ",".join(dict_ens_ids[rap_id]) + "\n") - count_prj += len(dict_ens_ids[rap_id]) - count_loc += 1 + rap_id = cols[0].upper() + if rap_id in dict_ens_ids : + print cols[1].split(",")[0] + "\t" + ",".join(dict_ens_ids[rap_id]) + #OS_2_PRJ_MAP.write(cols[1].upper() + "\t" + ",".join(set(dict_ens_ids[rap_id])) + "\n") + #count_prj += len(set(dict_ens_ids[rap_id])) + #count_loc += 1 RAP_IRGSP.close() - OS_2_PRJ_MAP.close() + #OS_2_PRJ_MAP.close() return [{"Unique Os loci: ": str(count_loc)}, {"Total projected loci:": str(count_prj)}] @@ -102,8 +95,11 @@ def inp_map(input_file, output_file) : INP.readline(); for line in INP : cols = line.rstrip().split() - os_locus = cols[1].rstrip(".1") - prj_locus = cols[2].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes + # ignore non-canonical orthologs + if int(cols[1].rsplit(".",1)[1]) > 1 : + continue + os_locus = cols[0].rstrip(".1") + prj_locus = cols[1].rsplit("_",1)[0].rsplit(".",1)[0] # remove any isoform suffixes if os_locus in dict_inp_ids : dict_inp_ids[os_locus].append(prj_locus) else : @@ -113,8 +109,8 @@ def inp_map(input_file, output_file) : OS_2_PRJ_MAP = open(path + output_file,'w') for k, v in sorted(dict_inp_ids.iteritems()) : #print k, v - OS_2_PRJ_MAP.write(k + "\t" + ",".join(v) + "\n") - count_prj += len(v) + OS_2_PRJ_MAP.write(k + "\t" + ",".join(set(v)) + "\n") + count_prj += len(set(v)) count_loc += 1 OS_2_PRJ_MAP.close() @@ -205,11 +201,29 @@ def compare_maps(input_file, output_file) : #---------------------------------------------------------------------------------------------------------------------- # main #---------------------------------------------------------------------------------------------------------------------- + +# process args +parser = argparse.ArgumentParser(description='Script with different analysis methods used to compare Ensembl and Inparanoid species projections.') + +parser.add_argument('-r', '--reciprocal_id', type=int, help='reciprocal identity percentage') +parser.add_argument('-e', '--ensembl_input', help='ensmbl input file') +parser.add_argument('-i', '--inparanoid_input', help='inparanoid input file') +parser.add_argument('-o', '--output', help='destination output file') + +args = parser.parse_args() +print args.accumulate(args.integers) + +input_file = +output_file = +map_source_type = +recip_id = sys.argv[4] + + list_counts = [] for case in switch(map_source_type): if case("ens"): - list_counts = ens_map(input_file, output_file) + list_counts = ens_map(input_file, output_file, recip_id) break if case("inp"): list_counts = inp_map(input_file, output_file)