#!/bin/csh -f

# This is the file we use to make the serialized grammars for the parser.
# If you are on the Stanford NLP machines, you can use it to remake the
# serialized parsers (such as when there have been incompatible software
# changes). Don't forget to klog first so you can access the AFS corpora.
#
# If you are not on the Stanford NLP machines, then the script won't work
# for you as is, since it contains hard-coded paths to various treebanks.
# But it may still be useful to inspect it to see what options we used to
# generate the various supplied grammars.
#
# NOTE: Output files in this script should ALWAYS use relative paths, so
# that you can copy this script and run it in a different directory and
# it will write output files there.
#
# usage:
#   cd /u/nlp/data/lexparser   # to have files output in "usual" location
#   kinit xxx@stanford.edu ; aklog -c ir.stanford.edu
#   $JAVANLP_HOME/projects/core/scripts/lexparser/makeSerialized.csh
#
# Exit status: 1 if a precondition check fails (JAVANLP_HOME unset, or the
# WSJ treebank is unreadable). Failures of individual training runs are only
# recorded in serializedParsers.log.
#
# Formatting note: live commands are split with backslash-newline, which csh
# treats as a blank. Do not put comments between continuation lines.
#
## Uncomment this bit to run it with older parser version
# setenv CLASSPATH /u/nlp/distrib/lexparser-2004-03-24/javanlp.jar:

if ( ! $?JAVANLP_HOME ) then
  echo 'JAVANLP_HOME is not set'
  echo 'Add a line like setenv JAVANLP_HOME $HOME/javanlp to your environment'
  # A bare "exit" would return the status of the echo above (0); fail loudly.
  exit 1
endif

set wsjptb=/afs/ir/data/linguistic-data/Treebank/3/parsed/mrg/wsj
# maybe use /scr/corpora/ldc/1999/LDC99T42/parsed/mrg/wsj instead

# now ctb6
set ctb=/afs/ir/data/linguistic-data/Chinese-Treebank/6/data/utf8/bracketed
# now ctb7!
# This is using the train/dev/test ranges recommended in the released files for ctb7
# train:
# [(81, 325), (400, 454), (500, 554), (590, 596), (600, 885), (900, 900), (1001, 1017), (1019, 1019), (1021, 1035), (1037, 1043), (1045, 1059), (1062, 1071), (1073, 1078), (1100, 1117), (1130, 1131), (1133, 1140), (1143, 1147), (1149, 1151), (2000, 2139), (2160, 2164), (2181, 2279), (2311, 2549), (2603, 2774), (2820, 3079)]
# dev:
# [(41, 80), (1120, 1129), (2140, 2159), (2280, 2294), (2550, 2569), (2775, 2799)]
# test:
# [(1, 40), (901, 931), (1018, 1018), (1020, 1020), (1036, 1036), (1044, 1044), (1060, 1061), (1072, 1072), (1118, 1119), (1132, 1132), (1141, 1142), (1148, 1148), (2165, 2180), (2295, 2310), (2570, 2602), (2800, 2819)]
set ctb7train=/u/nlp/data/chinese/ctb7/train.mrg
set ctb7test=/u/nlp/data/chinese/ctb7/test.mrg
set negra=/u/nlp/data/GermanACL08/negra/penn-format-train-dev-test
set ancoraTrain=/u/nlp/data/spanish/ancora/ancora.train
set ancoraTest=/u/nlp/data/spanish/ancora/ancora.test

# Short host name (everything before the first dot), used in log messages.
set host=`hostname | cut -d. -f1`

if ( ! -r $wsjptb ) then
  echo "Can't read WSJ PTB. Maybe you forgot to klog??"
  exit 1
endif

# Keep the previous log as .bak and start a fresh one.
mv -f serializedParsers.log serializedParsers.bak
uptime > serializedParsers.log
# Referencing an unset variable aborts a csh script, so test for it first.
if ( $?CLASSPATH ) then
  echo "Classpath is $CLASSPATH" >> serializedParsers.log
else
  echo "Classpath is not set" >> serializedParsers.log
endif

# English WSJ 2-21 PCFG binary and text grammars
( echo "Running wsjPCFG (goodPCFG) on $host" ; \
  time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" -goodPCFG \
    -saveToSerializedFile wsjPCFG.ser.gz -saveToTextFile wsjPCFG.txt \
    -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 \
) >>& ./serializedParsers.log

# English noTagSplit no rule compaction PCFG text grammar
( echo "Running wsjPCFG-noTagSplit-noCompact on $host" ; \
  time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" -goodPCFG -noTagSplit \
    -saveToTextFile wsjPCFG-noTagSplit.txt -compactGrammar 0 \
    -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 \
) >>& ./serializedParsers.log

# English WSJ 2-21 Factored binary
## Not yet clear that goodFactored is better than -ijcai03 -- not on dev set
# ( echo "Running wsjFactored (goodFactored) on $host" ; time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -goodFactored -saveToSerializedFile wsjFactored.ser.gz -saveToTextFile wsjFactored.txt -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log
( echo "Running wsjFactored (ijcai03 correctTags) on $host" ; \
  time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" -ijcai03 -v -printStates -compactGrammar 0 -correctTags \
    -saveToSerializedFile wsjFactored.ser.gz -saveToTextFile wsjFactored.txt \
    -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 \
) >>& ./serializedParsers.log
( echo "Running wsjFactored (ijcai03 replication) on $host" ; \
  time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" -ijcai03 -v -printStates -compactGrammar 0 \
    -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 \
) >>& ./serializedParsers.log

## "General English" models
# english{Factored|PCFG} is currently trained on:
# - WSJ sections 1-21
# - Genia as reformatted by Andrew Clegg, his training split
# - 2 English Chinese Translation Treebank and 3 English Arabic Translation
#   Treebank files backported to the original treebank annotation standards
#   (by us)
# - 95 sentences parsed by us (mainly questions and imperatives; a few from
#   recent newswire).
# /u/nlp/data/genia/sentences_cleaned.tree
# was using: /afs/ir/data/linguistic-data/Treebank/Treebank3Stanford/parsed/mrg/wsj

# "General English" Factored binary
( echo "Running englishFactored (from treebank) on $host" ; \
  time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -wordFunction edu.stanford.nlp.process.AmericanizeFunction \
    -evals "factDA,tsv" -ijcai03 \
    -saveToSerializedFile englishFactored.ser.gz -maxLength 40 \
    -train $wsjptb 100-2199 \
    -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 \
    -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english \
    -testTreebank ${wsjptb}/22 2200-2219 \
) >>& ./serializedParsers.log

# "General English" PCFG binary
( echo "Running englishPCFG (from treebank) on $host" ; \
  time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -wordFunction edu.stanford.nlp.process.AmericanizeFunction \
    -evals "factDA,tsv" -goodPCFG \
    -saveToSerializedFile englishPCFG.ser.gz -maxLength 40 \
    -train ${wsjptb} 100-2199 \
    -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 \
    -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english \
    -testTreebank ${wsjptb}/22 2200-2219 \
) >>& ./serializedParsers.log

# "General English" PCFG, case insensitive, binary
( echo "Running caseless englishPCFG (from treebank) on $host" ; \
  time java -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -wordFunction edu.stanford.nlp.process.LowercaseAndAmericanizeFunction \
    -evals "factDA,tsv" -goodPCFG \
    -saveToSerializedFile englishPCFG.caseless.ser.gz -maxLength 40 \
    -train ${wsjptb} 100-2199 \
    -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 \
    -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english \
    -testTreebank ${wsjptb}/22 2200-2219 \
) >>& ./serializedParsers.log

# English WSJ 2-21 PCFG simplified grammar
# This dumbed down parser is used by the RNN parser.
# See /scr/nlp/data/dvparser for more details.
( echo "Running wsj pcfg (simplified for use in the RNN parser) on $host" ; \
  time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 \
    -saveToSerializedFile wsjPCFG.nocompact.simple.ser.gz \
    -maxLength 40 -compactGrammar 0 \
    -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 \
) >>& ./serializedParsers.log

# English with extras PCFG simplified grammar
# This dumbed down parser is used by the RNN parser.
# See /scr/nlp/data/dvparser for more details.
( echo "Running english pcfg (simplified for use in the RNN parser) on $host" ; \
  time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -wordFunction edu.stanford.nlp.process.AmericanizeFunction \
    -evals "factDA,tsv" -goodPCFG -noRightRec -dominatesV 0 -baseNP 0 \
    -saveToSerializedFile englishPCFG.nocompact.simple.ser.gz \
    -maxLength 40 -compactGrammar 0 \
    -train ${wsjptb} 100-2199 \
    -train2 /u/nlp/data/lexparser/extraTrain 1-4000,9000-9099 0.5 \
    -taggedFiles tagSeparator=_,/u/nlp/data/pos-tagger/english/train-tech-english \
    -testTreebank ${wsjptb}/22 2200-2219 \
) >>& ./serializedParsers.log

# Xinhua Mainland Chinese PCFG binary
( echo "Running xinhuaPCFG on $host" ; \
  time java -mx800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" \
    -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams \
    -chinesePCFG -saveToSerializedFile xinhuaPCFG.ser.gz -maxLength 40 \
    -train $ctb 026-270,301-499,600-999 -test $ctb 001-025 \
) >>& ./serializedParsers.log
# new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
# newer train list (Galen and Huihsin): 026-270,301-499,600-999
# this is all Xinhua minus Stanford devel and Bikel test

# Xinhua Mainland Chinese Factored binary
( echo "Running xinhuaFactored on $host" ; \
  time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" \
    -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams \
    -acl03chinese -scTags \
    -saveToSerializedFile xinhuaFactored.ser.gz -maxLength 40 \
    -train $ctb 026-270,301-499,600-999 -test $ctb 001-025 \
) >>& ./serializedParsers.log

# Mixed dialect Chinese on lots of data (with chineseFactored)
( echo "Running chineseFactored on $host" ; \
  time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" \
    -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams \
    -chineseFactored \
    -saveToSerializedFile chineseFactored.ser.gz -maxLength 40 \
    -train $ctb7train -test $ctb7test \
) >>& ./serializedParsers.log
# new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
# newer train list (Galen and Huihsin): 026-270,301-499,600-999
# this is all Xinhua minus Stanford devel and Bikel test

# CTB files 001-499, 555-589,597-1000 are from newswire of
# XinHua.
# Files 500-554 are Information Services Department of HKSAR.
# Files 590-596 and 1001-1151 are Sinorama articles, more of literature
# nature and from Taiwan.
# Files 2000-3145 are ACE broadcast news (from where?). We only use a few for now.

# Mixed dialect Chinese PCFG on lots of data
( echo "Running chinesePCFG on $host" ; \
  time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" \
    -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams \
    -chinesePCFG -useUnicodeType \
    -saveToSerializedFile chinesePCFG.ser.gz -maxLength 40 \
    -train $ctb7train -test $ctb7test \
) >>& ./serializedParsers.log
# new train list (Galen and Huihsin): 026-270,301-499,555-589,597-1041
# newer train list (Galen and Huihsin): 026-270,301-499,600-999
# this is all Xinhua minus Stanford devel and Bikel test

# Chinese parser for unsegmented Chinese
# NOTE(review): this command spells the flag "-sctags" while xinhuaFactored
# uses "-scTags"; presumably the parser matches it case-insensitively --
# TODO confirm.
( echo "Running xinhuaFactoredSegmenting on $host" ; \
  time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" \
    -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams \
    -segmentMarkov -train $ctb 26-270,301-499,600-999 -sctags -acl03chinese \
    -saveToSerializedFile xinhuaFactoredSegmenting.ser.gz \
) >>& ./serializedParsers.log
# Run the freshly saved segmenting parser on one unsegmented sentence.
java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
  -evals "factDA,tsv" \
  -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams \
  -encoding utf-8 xinhuaFactoredSegmenting.ser.gz \
  /u/nlp/data/lexparser/chinese-onesent-unseg-utf8.txt >>& ./serializedParsers.log

# It used to be the case that explicitly saying tLPP on command line was
# needed for file encoding. But it has been fixed.
# ( echo "Running xinhuaFactored from serialized check on $host" ; time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams -maxLength 40 -loadFromSerializedFile xinhuaFactored.ser.gz -test $ctb 001-025 ) >>& ./serializedParsers.log
# This now works
( echo "Running xinhuaFactored from serialized (check without specifying -tLPP) on $host" ; \
  time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" -maxLength 40 \
    -loadFromSerializedFile xinhuaFactored.ser.gz -test $ctb 001-025 \
) >>& ./serializedParsers.log

( echo "Running chinesePCFG (simplified for use in the RNN parser) on $host" ; \
  time java -mx4g edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" \
    -tLPP edu.stanford.nlp.parser.lexparser.ChineseTreebankParserParams \
    -chineseFactored -PCFG -hMarkov 1 -nomarkNPconj -compactGrammar 0 \
    -saveToSerializedFile chinesePCFG.simple.ser.gz -maxLength 40 \
    -train $ctb7train -test $ctb7test \
) >>& ./serializedParsers.log

# German Factored binary from Negra (version 2)
# $negra 3 is the dev set
( echo "Running germanFactored on $host" ; \
  time java -mx5g edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" \
    -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams \
    -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 2 \
    -maxLength 40 -nodeCleanup 2 \
    -saveToSerializedFile germanFactored.ser.gz \
    -train $negra 1 -test $negra 3 \
) >>& ./serializedParsers.log

# German PCFG from Negra (version 2)
( echo "Running germanPCFG on $host" ; \
  time java -mx2g edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -v -evals tsv \
    -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams \
    -PCFG -hMarkov 1 -vMarkov 2 -vSelSplitCutOff 300 -uwm 1 -unknownSuffixSize 1 \
    -maxLength 40 -nodeCleanup 2 \
    -saveToSerializedFile germanPCFG.ser.gz \
    -train $negra 1 -test $negra 3 \
) >>& ./serializedParsers.log

# German Dependency parser
# This requires normalizing the dependency output to strip boundary symbol.
# ( echo "Running germanDep on $host" ; time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -tLPP edu.stanford.nlp.parser.lexparser.NegraPennTreebankParserParams -dep -hMarkov 1 -maxLength 40 -saveToSerializedFile germanDep.ser.gz -train $negra 1 -test $negra 3 ) >>& ./serializedParsers.log

#########
# SPANISH
#########

# Spanish PCFG
# NOTE(review): "-vSelSplitCutoff" here vs "-vSelSplitCutOff" in the German
# commands; presumably matched case-insensitively -- TODO confirm.
( echo "Running spanishPCFG on $host" ; \
  time java -mx6g edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -v -evals "factDA,tsv" \
    -tLPP edu.stanford.nlp.parser.lexparser.SpanishTreebankParserParams \
    -PCFG -vMarkov 3 -uwm 1 -vSelSplitCutoff 100 -rightRec \
    -train $ancoraTrain -test $ancoraTest \
    -saveToSerializedFile spanishPCFG.ser.gz \
) >>& ./serializedParsers.log

########
# The languages below this line use TreebankPreprocessor for pre-processing prior to training
########

set mydir=`pwd`
set data_dir=/u/nlp/data/lexparser/trees
set tree_pipe=$JAVANLP_HOME/projects/core/scripts/lexparser/run-tb-preproc
set train_sh=$JAVANLP_HOME/projects/core/scripts/lexparser/lexparser-lang-train-test.sh

# -p so that a missing parent directory does not make the creation fail.
if ( ! -e $data_dir ) then
  mkdir -p $data_dir
endif

########
# ARABIC
########
set ar_data_dir=$data_dir/Arabic
set ar_conf_file=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/arabic/pipeline/configurations/atb-latest.conf
set ar_train_args="Arabic 40 $ar_data_dir/2-Unvoc-All.utf8.txt $ar_data_dir/2-Unvoc-Dev.utf8.txt BASELINE_ar -saveToSerializedFile arabicFactored.ser.gz"

if ( ! -e $ar_data_dir ) then
  mkdir -p $ar_data_dir
endif

# Log the preprocessing command, then run it with its own build log.
echo Running $tree_pipe -p $ar_data_dir -v $ar_conf_file >>& ./serializedParsers.log
$tree_pipe -p $ar_data_dir -v $ar_conf_file >& $ar_data_dir/build.log
echo "" >>& ./serializedParsers.log

( echo "Training Arabic Factored grammar using baseline feature set" ; \
  time $train_sh $ar_train_args \
) >>& ./serializedParsers.log

########
# FRENCH
########
set fr_data_dir=$data_dir/FrenchCC
set fr_conf_file=$JAVANLP_HOME/projects/core/src/edu/stanford/nlp/international/french/pipeline/configurations/ftb-latest.conf
set fr_train_args="French 40 $fr_data_dir/FTB-All.utf8.txt $fr_data_dir/FTB-Dev.utf8.txt BASELINE_fr -saveToSerializedFile frenchFactored.ser.gz"

if ( ! -e $fr_data_dir ) then
  mkdir -p $fr_data_dir
endif

echo Running $tree_pipe -p $fr_data_dir -v $fr_conf_file >>& ./serializedParsers.log
$tree_pipe -p $fr_data_dir -v $fr_conf_file >& $fr_data_dir/build.log
echo "" >>& ./serializedParsers.log

# This echo only records the training command line in the log.
echo time $train_sh $fr_train_args >>& ./serializedParsers.log
( echo "Training French Factored grammar using baseline feature set" ; \
  time $train_sh $fr_train_args \
) >>& ./serializedParsers.log

## English just to check parser code regression (not saved)

## Just for reference
( echo "Running wsjPCFG (acl03pcfg replication) on $host" ; \
  time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" -acl03pcfg \
    -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 \
) >>& ./serializedParsers.log

## See if same results from serialized parser
( echo "Running wsjFactored (ijcai03 from serialized) on $host" ; \
  time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" -v -maxLength 40 \
    -loadFromSerializedFile wsjFactored.ser.gz \
    -testTreebank $wsjptb 2200-2219 \
) >>& ./serializedParsers.log
# ( echo "Running wsjFactored (ijcai03 with nodeprune) on $host" ; time java -mx1800m edu.stanford.nlp.parser.lexparser.LexicalizedParser -evals "factDA,tsv" -ijcai03 -v -compactGrammar 0 -nodePrune true -maxLength 40 -train $wsjptb 200-2199 -testTreebank $wsjptb 2200-2219 ) >>& ./serializedParsers.log

## See if same results from text grammar parser
( echo "Running wsjFactored (ijcai03 from textGrammar) on $host" ; \
  time java -mx1500m edu.stanford.nlp.parser.lexparser.LexicalizedParser \
    -evals "factDA,tsv" -v -maxLength 40 \
    -loadFromTextFile wsjFactored.txt \
    -testTreebank $wsjptb 2200-2219 \
) >>& ./serializedParsers.log

uptime >> serializedParsers.log

# Rotate the performance summaries: last -> 2ndlast, current -> last.
mv -f serializedParsersPerformance.last serializedParsersPerformance.2ndlast
mv -f serializedParsersPerformance.current serializedParsersPerformance.last
# Build the new summary from selected lines of this run's log.
echo -n "Parser run by $USER on " > serializedParsersPerformance.current
date >> serializedParsersPerformance.current
grep 'N: 253\|N: 393\|Done testing on treebank\|Running \| summary ' serializedParsers.log >> serializedParsersPerformance.current
echo >> serializedParsersPerformance.current
echo >> serializedParsersPerformance.current

# Append this run's summary to the cumulative history file.
cat serializedParsersPerformance.current >> serializedParsersPerformance.txt

# Rotate copies of the full logs in the same way.
cp -f serializedParsers.last serializedParsers.2ndlast
cp -f serializedParsers.current serializedParsers.last
cp -f serializedParsers.log serializedParsers.current