/){
            my $type=$1;
            my $targets1=$2;
            my $targets2=$3;
            $targets1 =~ s/^[\s]+//;
            $targets1 =~ s/[\s]+$//;
            $targets2 =~ s/^[\s]+//;
            $targets2 =~ s/[\s]+$//;
            if($celexid eq ""){
                # Nothing is written for a link when no celex id is set.
                # print $Fout $line,"\n";
            }
            else{
                # Turn the self-closing "<link ... />" into an opening tag;
                # the matching </link> is printed after the two text elements.
                $line =~ s/[\s]*\/>$/>/;
                print $Fout $line,"\n";
                print $Fout &addTextLanguage($lg1, $targets1, 1);
                print $Fout &addTextLanguage($lg2, $targets2, 2);
                print $Fout "<\/link>\n";
            }
            next;
        }
    }
    if($options{'outDir'} ne ""){
        close($Fout);
    }
}

# addTextLanguage($lg, $targets1, $n)
#
# Builds the <s$n> element that carries the text of one side of an
# alignment link.
#
#   $lg       - language key into the file-level %text hash
#   $targets1 - whitespace-separated list of paragraph indices
#   $n        - number appended to the tag name (callers pass 1 or 2)
#
# Returns the fragment as a newline-terminated string.  With a single target
# the paragraph text is placed directly inside <s$n>; with several targets
# each paragraph is wrapped in its own <p>...</p>.
#
# NOTE(review): the opening <s$n> tag and the <p>/</p> wrappers are restored
# here.  The literals were an empty string and bare blank lines, which left
# the closing </s$n> without a matching opening tag.  Confirm against the
# expected output format.
sub addTextLanguage{
    my ($lg, $targets1, $n)=@_;

    # split ' ' skips leading whitespace, so an untrimmed list does not
    # produce an empty first index.
    my @targets1 = split(' ', $targets1);

    # NOTE(review): an empty target list yields only a newline; confirm
    # whether an empty <s$n/> element is expected here instead.
    return "\n" unless @targets1;

    my $wrapParagraphs = scalar(@targets1) > 1;
    my $string = "<s$n>";
    foreach my $t (@targets1){
        my $paragraph = $text{$lg}->{s}->[$t];
        # A paragraph index with no stored text contributes an empty string.
        $paragraph = "" unless defined $paragraph;
        $string .= $wrapParagraphs ? "<p>".$paragraph."</p>" : $paragraph;
    }
    $string .= "<\/s$n>\n";
    return $string;
}

# getTextInfoFromXmlFile($lg, $celexid, $docid)
#
# Reads one document file and returns a hash reference with two keys:
#   celex - the $celexid passed in
#   s     - array reference; element N holds the text of paragraph number N
#
# The file path is $options{acquisDir}/$lg/<year>/$docid.xml, where <year>
# is the four digits (19xx or 20xx) that follow the first digit of $celexid,
# or the empty string when the id does not have that shape.
# Dies if the file cannot be opened.
sub getTextInfoFromXmlFile {
    my($lg,$celexid, $docid) = @_;

    my $year="";
    if($celexid =~ /^[0-9]((?:19|20)[0-9][0-9])/){
        $year=$1;
    }

    my $txtInfo={
        celex => $celexid,
        s     => [],
    };

    my $fileName=$options{acquisDir}."/".$lg."/".$year."/".$docid.".xml";
    open(my $F, "<:encoding(utf8)", $fileName) || die "Problems opeining file $fileName: $!\n";
    while (my $line = <$F>) {
        # The file is scanned line by line, so a paragraph is only picked up
        # when its opening and closing tags sit on the same line.
        # NOTE(review): the opening tag is assumed to be <p n="N">.  It was
        # missing from the pattern, which made $1 capture the text instead of
        # the paragraph number.  Confirm against the corpus files.
        if($line =~ /<p n="([0-9]+)">(.*)<\/p>/is) {
            $txtInfo->{s}->[$1]=$2;
        }
    }
    close $F;
    return $txtInfo;
}

__END__

=head1 NAME

getAlignmentWithText.pl - program that adds the text to the alignment files...

=head1 SYNOPSIS

  perl getAlignmentWithText.pl -acquisDir "JRC-Acquis_corpus_folder" jrc-en-fr.xml >en-fr_alignedCorpus_withText.xml

To select only the documents from a list of celex codes:

  perl getAlignmentWithText.pl -acquisDir "JRC-Acquis_corpus_folder" -selectionList "file_withCelexCode" jrc-en-fr.xml >en-fr_alignedCorpus_withText.xml

To process more files use an output folder as follows:

  perl getAlignmentWithText.pl -acquisDir "JRC-Acquis_corpus_folder" -selectionList "file_withCelexCode" -outDir "Output_folder" jrc-en-fr.xml >en-fr_alignedCorpus_withText.xml

The new files will have the name composed of the name of the input file (without extension) followed by "_withText" followed by the extension (e.g. jrc-en-fr_withText.xml)

=head1 DESCRIPTION

Outputs an aligned corpus, containing documents in the following format:

... ....header....

19 paragraph links:

Décision du Comité mixte de l'EEE DECIZIA COMITETULUI MIXT AL SEE no 163/2002 nr. 163/2002 du 6 décembre 2002 din 6 decembrie 2002 ....

The file is fully XML; we must use the UTF-8 encoding to handle all character sets (French-Greek for example).

Example of use for Lithuanian-Swedish alignment:

Before launching it, make sure you have uncompressed (using the gunzip command, for example) the alignment file.

  gunzip jrc-lt-sv.xml.gz

Then, you need to get and unpack the two corpora:

  tar xzf jrc-lt.tgz
  tar xzf jrc-sv.tgz

Then you can launch this program using a perl5 interpreter:

  perl getAlignmentWithText.pl -acquisDir . jrc-lt-sv.xml > jrc-lt-sv_withText.xml

=head1 COMMENTS

We have deliberately chosen to parse the texts without an XML parser. The format of the XML texts is well known, and the script has to be as fast as possible to handle 8000 texts in less than 5 minutes.

=head1 AUTHORS

camelia.ignat@jrc.it, bruno.pouliquen@jrc.it

=cut