345 lines
6.4 KiB
Plaintext
345 lines
6.4 KiB
Plaintext
%
|
|
% Context-sensitive mapping of PTB POS tags to
|
|
% Universal POS tags.
|
|
%
|
|
% Author: Sebastian Schuster
|
|
% Author: Christopher Manning
|
|
%
|
|
% The original Penn Treebank WSJ contains 45 POS tags (though using # as the tag for the British pound currency sign is almost certainly a bad idea!)
|
|
% {#=173, $=9,039, ''=8,658, ,=60,489, -LRB-=1,672, -RRB-=1,689, .=48,733, :=6,087, CC=29,462, CD=44,937, DT=101,190,
|
|
% EX=1,077, FW=268, IN=121,903, JJ=75,266, JJR=4,042, JJS=2,396, LS=64, MD=11,997, NN=163,935, NNP=114,053,
|
|
% NNPS=3,087, NNS=73,964, PDT=441, POS=10,801, PRP=21,357, PRP$=10,241, RB=38,197, RBR=2,175, RBS=555, RP=3,275,
|
|
% SYM=70, TO=27,449, UH=117, VB=32,565, VBD=37,493, VBG=18,239, VBN=24,865, VBP=15,377, VBZ=26,436, WDT=5,323,
|
|
% WP=2,887, WP$=219, WRB=2,625, ``=8,878}
|
|
%
|
|
% The Web Treebank corpus adds 6 tags, but doesn't have #, yielding 50 POS tags:
|
|
% ADD, AFX, GW, HYPH, NFP, XX
|
|
%
|
|
% OntoNotes 4.0 has 53 tags. It doesn't have # but adds: -LSB-, -RSB- [both mistakes!], ADD, AFX, CODE, HYPH, NFP,
|
|
% X [mistake!], XX.
|
|
%
|
|
%
|
|
% ------------------------------
|
|
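% Context-sensitive mappings (order matters: each "otherwise" rule assumes the more specific rules above it have already relabeled their targets)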
|
|
%
|
|
% TO -> PART (in CONJP phrases: TO is a direct child of a CONJP that also has a VB child)
|
|
@CONJP < TO=target < VB
|
|
|
|
relabel target PART
|
|
|
|
% TO -> PART (TO is a direct child of a VP that also directly dominates another VP)
|
|
@VP < @VP < (/^TO$/=target <... {/.*/})
|
|
|
|
relabel target PART
|
|
|
|
% TO -> PART (TO is the only child of its VP)
|
|
@VP <: (/^TO$/=target <... {/.*/})
|
|
|
|
relabel target PART
|
|
|
|
% TO -> ADP (otherwise: any TO that none of the TO -> PART rules above relabeled)
|
|
TO=target <... {/.*/}
|
|
|
|
relabel target ADP
|
|
|
|
% Disabled (the rule below is intentionally commented out): we are now treating these as copular constructions
|
|
% VB.* -> AUX (for passives where main verb is part of an ADJP)
|
|
%@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase|get|got|getting|gets|gotten)$/ ) < (@ADJP [ < VBN|VBD | < (@VP|ADJP < VBN|VBD) < CC ] )
|
|
%
|
|
%relabel target AUX
|
|
%
|
|
% VB.* -> AUX (for cases with fronted main VPs: under SINV, a form of "be" in a VP that follows a sister VP with a VBD|VBN child)
|
|
@SINV < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ ) $-- (@VP < VBD|VBN))
|
|
|
|
relabel target AUX
|
|
|
|
% VB.* -> AUX (another, rarer case of fronted VPs: as above, but the form of "be" sits one VP level deeper)
|
|
@SINV < (@VP < (@VP < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ )) $-- (@VP < VBD|VBN))
|
|
|
|
relabel target AUX
|
|
|
|
% VB.* -> AUX (passive, case 2) -- disabled: the pattern and relabel lines below are commented out
|
|
%SQ|SINV < (/^VB/=target < /^(?i:am|is|are|r|be|being|'s|'re|'m|was|were|been|s|ai|m|art|ar|wase)$/ $++ (VP < VBD|VBN))
|
|
%
|
|
%relabel target AUX
|
|
|
|
% VB.* -> AUX (active, case 1: a VB.* whose only child is one of the listed auxiliary word forms, including misspelled variants, directly under a VP that also directly dominates a VP)
% NOTE(review): this pattern uses plain VP where the neighbouring rules use @VP -- confirm that is intended
|
|
VP < VP < (/^VB.*$/=target <: /^(?i:will|have|can|would|do|is|was|be|are|has|could|should|did|been|may|were|had|'ll|'ve|does|am|might|ca|'m|being|'s|must|'d|'re|wo|shall|get|ve|s|got|r|m|getting|having|d|re|ll|wilt|v|of|my|nt|gets|du|wud|woud|with|willl|wil|wase|shoul|shal|`s|ould|-ll|most|made|hvae|hav|cold|as|art|ai|ar|a)$/)
|
|
|
|
relabel target AUX
|
|
|
|
% VB.* -> AUX (active, case 2: a VB.* directly under SQ or SINV that has a later sister whose label starts with VP)
|
|
@SQ|SINV < (/^VB/=target $++ /^(?:VP)/ <... {/.*/})
|
|
|
|
relabel target AUX
|
|
|
|
% VB.* -> VERB (otherwise: any VB.* that none of the VB.* -> AUX rules above relabeled)
|
|
/^VB.*/=target <... {/.*/}
|
|
|
|
relabel target VERB
|
|
|
|
% IN -> SCONJ (subordinating conjunctions: IN directly under SBAR, with or without a "-" suffix on the label, and followed by a sister S, FRAG, SBAR or SINV)
|
|
/^SBAR(-[^ ]+)?$/ < (IN=target $++ @S|FRAG|SBAR|SINV <... {/.*/})
|
|
|
|
relabel target SCONJ
|
|
|
|
% IN -> SCONJ (subordinating conjunctions II: IN directly under PP and immediately followed by a sister SBAR or S)
|
|
@PP < (IN=target $+ @SBAR|S)
|
|
|
|
relabel target SCONJ
|
|
|
|
% IN -> ADP (otherwise: any IN that neither IN -> SCONJ rule above relabeled)
|
|
IN=target < __
|
|
|
|
relabel target ADP
|
|
|
|
% NN -> SYM (in case of the percent sign)
% NOTE(review): the leaf regex is written \\% (two backslashes before the percent sign) -- confirm it matches percent tokens as they actually appear in the trees
|
|
NN=target <... {/\\%/}
|
|
|
|
relabel target SYM
|
|
|
|
% NN -> PRON (fused det-noun pronouns: some-/any-/every-/no- + body/thing/one, matched case-insensitively)
|
|
NN=target < (/^(?i:(somebody|something|someone|anybody|anything|anyone|everybody|everything|everyone|nobody|nothing))$/)
|
|
|
|
relabel target PRON
|
|
|
|
% NN -> NOUN (otherwise: any NN that the NN -> SYM and NN -> PRON rules above did not relabel)
|
|
NN=target <... {/.*/}
|
|
|
|
relabel target NOUN
|
|
|
|
% NFP -> PUNCT (token made up solely of one or more tildes, or of asterisks, or of hyphens)
|
|
NFP=target <... {/^(~+|\*+|\-+)$/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% NFP -> SYM (otherwise: any NFP that the NFP -> PUNCT rule above did not relabel)
|
|
NFP=target <... {/.*/}
|
|
|
|
relabel target SYM
|
|
|
|
% RB -> PART when it is verbal negation ("not" or its reductions, case-insensitive) directly under VP, SINV, SQ, FRAG or ADVP
|
|
@VP|SINV|SQ|FRAG|ADVP < (RB=target < /^(?i:not|n't|nt|t|n)$/)
|
|
|
|
relabel target PART
|
|
|
|
% RB -> ADV (otherwise: any RB that the negation rule above did not relabel)
|
|
RB=target <... {/.*/}
|
|
|
|
relabel target ADV
|
|
|
|
% DT -> PRON (pronominal this/that/these/those: the DT is the only child of its NP; case-insensitive)
|
|
@NP <: (DT=target < /^(?i:th(is|at|ose|ese))$/)
|
|
|
|
relabel target PRON
|
|
|
|
% DT -> DET (otherwise: any DT that the DT -> PRON rule above did not relabel)
|
|
DT=target < __
|
|
|
|
relabel target DET
|
|
|
|
% WDT -> PRON (pronominal that/which: the WDT is the only child of a WHNP or NP; case-insensitive)
|
|
@WHNP|NP <: (WDT=target < /^(?i:(that|which))$/)
|
|
|
|
relabel target PRON
|
|
|
|
% WDT -> SCONJ (incorrectly tagged subordinating conjunctions: that/which as a direct child of SBAR)
|
|
@SBAR < (WDT=target < /^(?i:(that|which))$/)
|
|
|
|
relabel target SCONJ
|
|
|
|
% WDT -> DET (otherwise: any WDT that the WDT -> PRON and WDT -> SCONJ rules above did not relabel)
|
|
WDT=target <... {/.*/}
|
|
|
|
relabel target DET
|
|
|
|
% ------------------------------
|
|
% One-to-one mappings (each rule below relabels its PTB tag without looking at the surrounding tree)
|
|
%
|
|
%
|
|
% CC -> CCONJ
|
|
CC=target <... {/.*/}
|
|
|
|
relabel target CCONJ
|
|
|
|
% CD -> NUM
|
|
CD=target <... {/.*/}
|
|
|
|
relabel target NUM
|
|
|
|
% EX -> PRON
|
|
EX=target <... {/.*/}
|
|
|
|
relabel target PRON
|
|
|
|
% FW -> X
|
|
FW=target <... {/.*/}
|
|
|
|
relabel target X
|
|
|
|
% JJ.* -> ADJ (the regex covers JJ as well as JJR and JJS)
|
|
/^JJ.*$/=target < __
|
|
|
|
relabel target ADJ
|
|
|
|
% LS -> X
|
|
LS=target <... {/.*/}
|
|
|
|
relabel target X
|
|
|
|
% MD -> AUX
|
|
MD=target <... {/.*/}
|
|
|
|
relabel target AUX
|
|
|
|
% NNS -> NOUN
|
|
NNS=target <... {/.*/}
|
|
|
|
relabel target NOUN
|
|
|
|
% NNP -> PROPN
|
|
NNP=target <... {/.*/}
|
|
|
|
relabel target PROPN
|
|
|
|
% NNPS -> PROPN
|
|
NNPS=target <... {/.*/}
|
|
|
|
relabel target PROPN
|
|
|
|
% PDT -> DET
|
|
PDT=target <... {/.*/}
|
|
|
|
relabel target DET
|
|
|
|
% POS -> PART
|
|
POS=target <... {/.*/}
|
|
|
|
relabel target PART
|
|
|
|
% PRP -> PRON
|
|
PRP=target <... {/.*/}
|
|
|
|
relabel target PRON
|
|
|
|
% PRP$ -> PRON
|
|
/^PRP\$$/=target <... {/.*/}
|
|
|
|
relabel target PRON
|
|
|
|
% RBR -> ADV
|
|
RBR=target <... {/.*/}
|
|
|
|
relabel target ADV
|
|
|
|
% RBS -> ADV
|
|
RBS=target <... {/.*/}
|
|
|
|
relabel target ADV
|
|
|
|
% RP -> ADP
|
|
RP=target <... {/.*/}
|
|
|
|
relabel target ADP
|
|
|
|
% UH -> INTJ
|
|
UH=target <... {/.*/}
|
|
|
|
relabel target INTJ
|
|
|
|
% WP -> PRON
|
|
WP=target <... {/.*/}
|
|
|
|
relabel target PRON
|
|
|
|
% WP$ -> PRON
|
|
/^WP\$$/=target <... {/.*/}
|
|
|
|
relabel target PRON
|
|
|
|
% WRB -> ADV
|
|
WRB=target <... {/.*/}
|
|
|
|
relabel target ADV
|
|
|
|
% `` -> PUNCT
|
|
/^``$/=target <... {/.*/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% '' -> PUNCT
|
|
/^''$/=target < __
|
|
|
|
relabel target PUNCT
|
|
|
|
% ( -> PUNCT
|
|
/^\($/=target <... {/.*/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% ) -> PUNCT
|
|
/^\)$/=target <... {/.*/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% -LRB- -> PUNCT
|
|
/^-LRB-$/=target <... {/.*/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% -RRB- -> PUNCT
|
|
/^-RRB-$/=target <... {/.*/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% , -> PUNCT
|
|
/^,$/=target <... {/.*/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% . -> PUNCT
|
|
/^\.$/=target <... {/.*/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% : -> PUNCT
|
|
/^:$/=target <... {/.*/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% HYPH -> PUNCT
|
|
HYPH=target <... {/.*/}
|
|
|
|
relabel target PUNCT
|
|
|
|
% # -> SYM
|
|
/^#$/=target <... {/.*/}
|
|
|
|
relabel target SYM
|
|
|
|
% $ -> SYM. Also note that SYM itself gets no rule in this file: SYM -> SYM would be a no-op.
|
|
/^\$$/=target <... {/.*/}
|
|
|
|
relabel target SYM
|
|
|
|
% ADD -> X
|
|
ADD=target <... {/.*/}
|
|
|
|
relabel target X
|
|
|
|
% AFX -> X
|
|
AFX=target <... {/.*/}
|
|
|
|
relabel target X
|
|
|
|
% GW -> X
|
|
GW=target <... {/.*/}
|
|
|
|
relabel target X
|
|
|
|
% XX -> X
|
|
XX=target <... {/.*/}
|
|
|
|
relabel target X
|
|
|