From e35d6e9662cb6f20d730bd531ad8e7ad28a18bd7 Mon Sep 17 00:00:00 2001 From: preecej Date: Thu, 28 Jul 2011 18:36:18 +0000 Subject: [PATCH] Completed data read-in of GAF file (paw_TransformForImport.pl) svn path=/; revision=126 --- preecej/php_singletons/cmd_line_test.php | 14 + .../semantic_wiki/paw_TransformForImport.pl | 330 +++++++++++++++--- 2 files changed, 287 insertions(+), 57 deletions(-) create mode 100644 preecej/php_singletons/cmd_line_test.php diff --git a/preecej/php_singletons/cmd_line_test.php b/preecej/php_singletons/cmd_line_test.php new file mode 100644 index 0000000..0a58b4e --- /dev/null +++ b/preecej/php_singletons/cmd_line_test.php @@ -0,0 +1,14 @@ +load('/home/preecej/Documents/projects/pathvisio/Ath_scratch.gpml'); + +$entry = $doc->getElementsByTagName("Pathway"); +$author = $entry->item(0)->getAttribute("Author"); +print "Author:$author\n"; + +print "test over\n"; + +?> diff --git a/preecej/semantic_wiki/paw_TransformForImport.pl b/preecej/semantic_wiki/paw_TransformForImport.pl index 92a4713..7605392 100644 --- a/preecej/semantic_wiki/paw_TransformForImport.pl +++ b/preecej/semantic_wiki/paw_TransformForImport.pl @@ -94,6 +94,7 @@ use strict; use Cwd; use Switch; use Getopt::Std; +use Data::Dumper; # specific use XML::Smart; @@ -111,16 +112,31 @@ my $output_file; my $verbose = 0; # flag for verbose output my $debug = 0; # debugging switch -# data storage +# global data storage ---------- + +# universal my %source; +my $xml; +my $output_data; +my %aspects; + +# tab, csv my $template_name; my @field_names; my @field_data; -my $xml; -my $output_data; -my $source_title_seed = 3; -my $annot_title_seed = 3; +# gaf +my %annotation_universals; # holds values assumed not to vary across the file +my %annotations; # keyed on Gene Symbol + +# other config constants + +# page id seeds, until I can figure out how to auto-increment w/in the import +# script +my $SOURCE_TITLE_SEED = 3; +my $ANNOT_TITLE_SEED = 3; + +$Data::Dumper::Pad = "... "; # --------------------------------------------------------------------------- # functions @@ -188,19 +204,14 @@ sub init } -# read, parse, and store source, template, and annotations +# read, parse, and store generic CSV and tab templates and annotations # --------------------------------------------------------------------------- -sub import_data +sub import_generic { - print "Opening input file and reading header info...\n\n"; - - # open file - open(INPUT_FILE,$input_file) or die("Could not open input file."); - - my $line; # all-purpose line counter - - # read in the source data + # read in "[Format] section... + my $line; my $count = 0; + while () { $count++; @@ -208,39 +219,13 @@ sub import_data chomp $line; my $data_val = (split('=',$line))[1]; switch ($count) { - case 2 { $source{'SourceDateStamp'} = $data_val; } - case 3 { $source{'SourceDatabase'} = $data_val; } - case 4 { $source{'SourceVersion'} = $data_val; } - case 5 { $source{'SourceURI'} = $data_val; } - case 6 { $source{'SourceFile'} = $data_val; } + case 2 { $template_name = $data_val; } + case 3 { @field_names = split($file_del,$data_val); } else {;} } - if ($count == 6) { last; } - } - - # read in "[Format] section if filetype is tab or csv" - $count++; - if ($file_type =~ /(csv)|(tab)/) { - - $count = 0; - while () - { - $count++; - $line = $_; - chomp $line; - my $data_val = (split('=',$line))[1]; - switch ($count) { - case 2 { $template_name = $data_val; } - case 3 { @field_names = split($file_del,$data_val); } - else {;} - } - if ($count == 3) { last; } - } + if ($count == 3) { last; } } - print "Reading data...\n\n"; - $line = ; # skip "[Data]" - # loop through data rows and add all data fields to an array of hashes while () { @@ -258,6 +243,200 @@ sub import_data } push @field_data, \%tmp_hash; } +} + +# read, parse, and store GAF annotations +# --------------------------------------------------------------------------- +sub import_gaf +{ + #[GAF implied data structure] + # singletons: db/Source (not really, but OK for now), taxon/Species ID, assigned_by, gene_type/Gene Type (later: proteins, too) + # unneeded: db_object_id, date, qualifier, db_reference (get PMID from here later), with_or_from (later) + # unvaried fields (gene-level): db_object_symbol/Gene Symbol, db_object_name/Gene Name, db_object_synonym/Gene Locus|Source Accession ID|Chromosome|Gene Synonyms (see below), + # varied fields (gene synonyms): db_object_synonym/Gene Synonym + # varied fields (ontology-level): term_id/Term ID, evidence_code/Evidence Code, aspect/Aspect, + + # [Annotation Object Structure] + # %annotation_universals + # "Source" + # "Species ID" + # "Gene Type" + # %annotations + # "$Gene Symbol" => %annotation_properties + # "Gene Name" => string + # "Gene Locus"/"Source Accession ID" (first uc("AT.G")) => string + # "Chromosome" (AT#G in "Gene Locus") => string + # "Gene Synonyms" => pipe-delimited string of synonyms + # "Ontological Reference" => %annotation_ontology_refs + # "Term ID" => string + # "Evidence Code" => string + # "Aspect" => string + + # loop through data rows and build hashed annotation data structure + my $count = 0; + + while () + { + $count++; + my $line = $_; + chomp $line; + + my @curr_line_ary = split("\t", $line); + if ($debug) { print "...\n" + . Dumper(\@curr_line_ary) . "\n\n"; } + + my %curr_line_hash = ( + "db" => $curr_line_ary[0], # Source + "db_object_symbol" => $curr_line_ary[2], # Gene Symbol + "term_id" => $curr_line_ary[4], # Term ID + "evidence_code" => $curr_line_ary[6], # Evidence Code + "aspect" => $curr_line_ary[8], # Aspect + "db_object_name" => $curr_line_ary[9], # Gene Name + + # Gene Locus, Source Accession ID, Chromosome, Gene Synonyms + "db_object_synonym" => $curr_line_ary[10], + + "db_object_type" => $curr_line_ary[11], # Gene Type + "taxon" => $curr_line_ary[12] # Species ID + ); + + if ($debug) { print "...\n" + . Dumper(\%curr_line_hash) . "\n\n"; } + + # grab the unvaried values from the first line + if ($count == 1) { + %annotation_universals = ( + "Source" => $curr_line_hash{"db"}, + "Gene Type" => $curr_line_hash{"db_object_type"}, + "Species" => "NCBI:" . (split(':',$curr_line_hash{"taxon"}))[1] + ); + + if ($debug) { print "...\n" + . Dumper(\%annotation_universals) . "\n\n"; } + + # identify what kind of ontology term we're dealing with and set our + # aspect names accordingly + switch ((split(':',$curr_line_hash{"term_id"}))[0]) { + case 'GO' { # Gene Ontology + %aspects = ( + P => "Biological Process", + C => "Cellular Component", + F => "Molecular Function" + ); + } + case 'PO' { # Plant Ontology + %aspects = ( + A => "Plant Anatomy", + G => "Plant Growth and Development Stage" + ); + } + } + } + + # check to see if Gene Symbol hash key exists (for grouping) + # if not, add the new Gene Symbol and its associated props + if (!exists $annotations{$curr_line_hash{"db_object_symbol"}}) + { + # prepare Gene Locus, Source Accession ID, Chromosome, Gene Synonyms + my @synonyms = split('\|',$curr_line_hash{"db_object_synonym"}); + + if ($debug) { print "...\n" + . Dumper(\@synonyms) . "\n\n"; } + + # find the gene locus, if it is listed (first "AT.G") + my @loci = grep /[Aa][Tt].[Gg]/, @synonyms; + + if ($debug) { print "...\n" + . Dumper(\@loci) . "\n\n"; } + + my $locus = ""; + + if (scalar(@loci) > 0) # we have at least one match; use the first one + { + $locus = $loci[0]; + } + else # no match; attempt to use the Gene Synonym instead + { + if ($curr_line_hash{"db_object_symbol"} =~ /[Aa][Tt].[Gg]/) + { + # the split drops the variant/allele signifier + $locus = (split('.',$curr_line_hash{"db_object_symbol"}))[0]; + } + } + # chromosome = third char in locus, if it exists + my $chromosome = ($locus ne "" ? (split('',$locus))[2] : ""); + + # set up props + my $annotation_properties = { + "Gene Name" => $curr_line_hash{"db_object_name"}, + "Gene Locus" => $locus, # also used for Source Accession ID + "Chromosome" => $chromosome, + "Gene Synonyms" => $curr_line_hash{"db_object_synonym"}, # pipe-delimited string + }; + + if ($debug) { print "...\n" + . Dumper($annotation_properties) . "\n\n"; } + + + # add new gene annotation and assign props + $annotations{$curr_line_hash{"db_object_symbol"}} = $annotation_properties; + } + + # add new ontology data (this happens on every line of data) + my $annotation_ontology_ref = { + "Term ID" => $curr_line_hash{"term_id"}, + "Aspect" => $aspects{uc($curr_line_hash{"aspect"})}, + "Evidence Code" => $curr_line_hash{"evidence_code"} + }; + + if ($debug) { print "...\n" + . Dumper($annotation_ontology_ref) . "\n\n"; } + + push @{$annotations{$curr_line_hash{"db_object_symbol"}}{"Ontological Reference"}}, $annotation_ontology_ref; + + } + if ($debug) { print "...\n" + . Dumper(\%annotations) . "\n\n"; } +} + + +# read, parse, and store source +# --------------------------------------------------------------------------- +sub import_data +{ + print "Opening input file and reading header info...\n\n"; + + # open file + open(INPUT_FILE,$input_file) or die("Could not open input file."); + + # read in the source data + my $count = 0; + my $line; + while () + { + $count++; + $line = $_; + chomp $line; + my $data_val = (split('=',$line))[1]; + switch ($count) { + case 2 { $source{'SourceDateStamp'} = $data_val; } + case 3 { $source{'SourceDatabase'} = $data_val; } + case 4 { $source{'SourceVersion'} = $data_val; } + case 5 { $source{'SourceURI'} = $data_val; } + case 6 { $source{'SourceFile'} = $data_val; } + else {;} + } + if ($count == 6) { last; } + } + + print "Reading data...\n\n"; + $line = ; # skip "[Data]" + + switch ($file_type) { + case ('csv' || 'tab') { import_generic(); } + case 'gaf' { import_gaf(); } + } + close INPUT_FILE; } @@ -272,17 +451,29 @@ sub show_input } print "\n"; - if ($file_type =~ /(csv)|(tab)/) { - print "[Template]\n$template_name\n\n"; - print "[Fields]\n" . join(', ',@field_names) . "\n\n"; - } - - print "[Data]\n"; - foreach my $row (@field_data) { - foreach my $key (keys %$row) { - print "$key => " . $row->{$key} . "\n"; + switch ($file_type) { + case ('csv' || 'tab') { + print "[Template]\n$template_name\n\n"; + print "[Fields]\n" . join(', ',@field_names) . "\n\n"; + + print "[Data]\n"; + foreach my $row (@field_data) { + foreach my $key (keys %$row) { + print "$key => " . $row->{$key} . "\n"; + } + print "\n"; + } + } + case 'gaf' { + print "[Data]\n"; + + for my $key (keys %annotation_universals) { + print "$key: " . $annotation_universals{$key} . "\n"; + } + print "\n"; + + print "[Annotations]\n" . Dumper(\%annotations) . "\n\n"; } - print "\n"; } print "\n"; } @@ -302,7 +493,7 @@ sub transform_generic # set root element, source page and elements # (temp set of page title until moved to import extension) - $xml->{Pages}{Page} = {Title => "Source:$source_title_seed"}; + $xml->{Pages}{Page} = {Title => "Source:$SOURCE_TITLE_SEED"}; $curr_node = $xml->{Pages}{Page}; $curr_node->{Template} = {Name => 'Source'}; @@ -323,7 +514,7 @@ sub transform_generic if ($debug) { print "Current node: " . $curr_node->path . "\n"; } - my $next_page_title_id = $annot_title_seed; + my $next_page_title_id = $ANNOT_TITLE_SEED; # iterate through the data foreach my $row (@field_data) { @@ -388,7 +579,7 @@ sub transform_generic # assign the relevant provenance field data $curr_prov_node->{Field}[0] = {Name => 'Source'}; - $curr_prov_node->{Field}[0]->content(0,"Source:$source_title_seed"); + $curr_prov_node->{Field}[0]->content(0,"Source:$SOURCE_TITLE_SEED"); $curr_prov_node->{Field}[1] = {Name => 'Source Accession ID'}; $curr_prov_node->{Field}[1]->content(0,$curr_accession_id); $curr_prov_node->{Field}[2] = {Name => 'Source Template'}; @@ -412,6 +603,31 @@ sub transform_generic # --------------------------------------------------------------------------- sub transform_gaf { + # + + # my $parser = new XML::DOM::Parser; + # my $doc = $parser->parsefile ("file.xml"); + + # # print all HREF attributes of all CODEBASE elements + # my $nodes = $doc->getElementsByTagName ("CODEBASE"); + # my $n = $nodes->getLength; + + # for (my $i = 0; $i < $n; $i++) + # { + # my $node = $nodes->item ($i); + # my $href = $node->getAttributeNode ("HREF"); + # print $href->getValue . "\n"; + # } + + # # Print doc file + # $doc->printToFile ("out.xml"); + + # # Print to string + # print $doc->toString; + + # # Avoid memory leaks - cleanup circular references for garbage collection + # $doc->dispose; + $output_data = "hullo, gaf"; } -- 2.34.1