/* This code is MODIFIED from the original source distribution, originally by
M. Collins (1999). The original documentation is contained below.
The modifications to this file are by Min-Yen Kan, and are also distributed
under GNU GPL license, see below or the GNU GPL License included with the
distribution.
The modifications enable the parser to work as a daemon, see the distributed
README-daemonCollins.html for details.
*/
/* This code is the statistical natural language parser described in
M. Collins. 1999. Head-Driven
Statistical Models for Natural Language Parsing. PhD Dissertation,
University of Pennsylvania.
Copyright (C) 1999 Michael Collins
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "grammar.h"
/* When non-zero, read_grammar reports each completed loading stage on
   stderr. */
#define GDEBUG 1
/* Forward declarations of the loading routines defined further down in this
   file, so that read_grammar can call them before their definitions. */
/* Clears the rule tables and the unary / subcat counters. */
void init_grammar();
/* Resets the word tables, creates the two lexicons and registers *TRACE*. */
void init_lexicons();
/* Reads "word tag flag" entries into the word lexicon and tag dictionary. */
void read_lexicon(FILE *file);
/* Reads the L / R / U / X / Y rule records into the grammar tables. */
void read_grm(FILE *file);
/* Adds every non-terminal name in the stream to nt_lex. */
void read_nonterminals(FILE *file);
/* Stores the non-terminal names in nts and builds argmap / gapmap. */
void read_nonterminals2(FILE *file);
/* Opens the file whose name is filename followed by suffix, for reading.
   Prints a diagnostic on stderr and terminates the process when the combined
   name does not fit the local path buffer or when the file cannot be opened,
   so the caller never receives a NULL stream. */
static FILE *open_grammar_file(const char *filename, const char *suffix)
{
  char path[1000];
  FILE *file;

  if(strlen(filename) + strlen(suffix) >= sizeof(path))
    {
      fprintf(stderr,"Grammar file name too long: %s%s\n",filename,suffix);
      exit(EXIT_FAILURE);
    }
  strcpy(path,filename);
  strcat(path,suffix);

  file = fopen(path,"r");
  if(file == NULL)
    {
      fprintf(stderr,"Cannot open grammar file %s\n",path);
      exit(EXIT_FAILURE);
    }
  return file;
}

/* Loads a complete grammar from the files <filename>.nts,
   <filename>.lexicon and <filename>.grm.

   The lexicons and grammar tables are reset first, then the three files are
   read in that order.  When GDEBUG is non-zero a progress line is written to
   stderr after each stage.  A file that cannot be opened is reported on
   stderr and ends the process; every stream opened here is closed again once
   its reader has returned. */
void read_grammar(char *filename)
{
  FILE *file;

  assert(filename != NULL);

  init_lexicons();
  if(GDEBUG) fprintf(stderr,"Initialised lexicons\n");
  init_grammar();
  if(GDEBUG) fprintf(stderr,"Initialised grammar\n");

  /* The .nts file is read twice from its beginning: read_nonterminals
     consumes the whole stream, so read_nonterminals2 gets a fresh one. */
  file = open_grammar_file(filename,".nts");
  read_nonterminals(file);
  fclose(file);
  file = open_grammar_file(filename,".nts");
  read_nonterminals2(file);
  fclose(file);
  if(GDEBUG) fprintf(stderr,"Loaded non-terminals\n");

  file = open_grammar_file(filename,".lexicon");
  read_lexicon(file);
  fclose(file);
  if(GDEBUG) fprintf(stderr,"Loaded lexicon\n");

  file = open_grammar_file(filename,".grm");
  read_grm(file);
  fclose(file);
  if(GDEBUG) fprintf(stderr,"Loaded grammar\n");
}
/* Puts the grammar tables into their empty state: every cell of tablep and
   tablef, every entry of unary_nums, and every cell of lsubcats_counts and
   rsubcats_counts is set to 0.  All index ranges run over 0..GMAXNTS-1. */
void init_grammar()
{
  int x,y,z;

  for(x=0;x<GMAXNTS;x++)
    {
      unary_nums[x] = 0;

      for(y=0;y<GMAXNTS;y++)
        {
          rsubcats_counts[x][y] = 0;
          lsubcats_counts[x][y] = 0;

          for(z=0;z<GMAXNTS;z++)
            {
              tablef[x][y][z] = 0;
              tablep[x][y][z] = 0;
            }
        }
    }
}
/* Resets the word-level tables and creates the two lexicons.

   Every fwords entry is set to GUNKNOWN and every tagdict cell to 0, then
   wordlex and nt_lex are created with make_lex and the word "*TRACE*" is
   added to wordlex; its index is stored in TRACEWORD. */
void init_lexicons()
{
  int i,j;

  for(i=0;i<GMAXWORDS;i++)
    {
      fwords[i] = GUNKNOWN;
      for(j=0;j<GMAXNTS;j++)
        tagdict[i][j] = 0;
    }

  /* NOTE(review): 200003 and 1003 are presumably table sizes for the two
     lexicons -- confirm against make_lex. */
  make_lex(200003,&wordlex);
  make_lex(1003,&nt_lex);

  TRACEWORD = add_word("*TRACE*",&wordlex);
  /* TRACEWORD is used as an index into fwords just below, so validate it
     the same way the other add_word / find_word results in this file are. */
  assert(TRACEWORD>=0 && TRACEWORD<GMAXWORDS);

  /* NOTE(review): read_lexicon stores a word's own index (or GUNKNOWN) in
     fwords, whereas the literal 1 is stored here -- confirm this is
     intended. */
  fwords[TRACEWORD] = 1;
}
/* Adds every whitespace-separated token of the stream to nt_lex, then looks
   up the two names used for traces: TRACENT is set to the index of "NP-A-g"
   and TRACETAG to the index of "NN".  Both lookups are asserted to yield a
   value greater than 0.

   file must be an open, readable stream; it is read to end-of-file and is
   not closed here. */
void read_nonterminals(FILE *file)
{
  char buffer[1000];
  int n;

  assert(file!=NULL);

  /* The field width keeps an over-long token from overrunning buffer. */
  while(fscanf(file,"%999s",buffer)==1)
    {
      n=add_word(buffer,&nt_lex);
      assert(n>=0);
      (void)n;
    }

  TRACENT = find_word("NP-A-g",&nt_lex);
  assert(TRACENT>0);
  TRACETAG = find_word("NN",&nt_lex);
  assert(TRACETAG>0);
}
/* Returns the index of the last entry among nts[0..numnts-1] that equals
   name, or -1 when no entry matches. */
static int last_nt_named(const char *name)
{
  int j,found;

  found = -1;
  for(j=0;j<numnts;j++)
    if(strcmp(name,nts[j])==0)
      found = j;
  return found;
}

/* Reads the non-terminal names into nts and sets up the argument and gap
   maps.

   One name is taken per input line (the first whitespace-delimited token);
   a line without a token still occupies a slot and is stored as an empty
   name, so slot numbers keep following line numbers.  numnts ends up as the
   number of lines read.

   Afterwards, for every stored name:
   - argmap[i] is the index of the name cut off at its first '-' (when such
     an entry exists), otherwise i; hasarg[i] is 1 when some '-' in the name
     is directly followed by 'A'.
   - gapmap[i] is the index of the name without its trailing "-g" (when the
     name ends in "-g" and such an entry exists), otherwise i; hasgap[i] is 1
     for every name ending in "-g".
   - "Ss-A-g" is special-cased: its gapmap entry points at "S-A" and its
     argmap entry at "S", when those names exist.

   file must be an open, readable stream; it is not closed here. */
void read_nonterminals2(FILE *file)
{
  int i,j,stripped,target;
  size_t len;
  char buffer[1200];
  char nt_line[100];
  char token[100];

  assert(file!=NULL);

  numnts=0;
  while(fgets(nt_line,sizeof(nt_line),file)!=NULL)
    {
      /* the maps below are GMAXNTS long, so refuse to index past them */
      assert(numnts<GMAXNTS);

      token[0]='\0';
      sscanf(nt_line,"%99s",token);

      /* a token may be up to 99 characters; make sure it fits its slot */
      assert(strlen(token)<sizeof(nts[numnts]));
      strcpy(nts[numnts],token);
      numnts++;
    }

  /*by default every non-terminal maps to itself and has no argument/gap*/
  for(i=0;i<GMAXNTS;i++)
    {
      argmap[i]=i;
      hasarg[i]=0;
      gapmap[i]=i;
      hasgap[i]=0;
    }

  /*argument map: strip everything from the first '-' onwards*/
  for(i=0;i<numnts;i++)
    {
      strcpy(buffer,nts[i]);
      stripped=0;
      /* Scanning right to left and cutting at each '-' leaves the text
         before the first '-'.  buffer[j+1] is always within the string
         because the terminator itself is never a '-'. */
      for(j=(int)strlen(buffer);j>=0;j--)
        if(buffer[j]=='-')
          {
            buffer[j]='\0';
            stripped=1;
            if(buffer[j+1]=='A')
              hasarg[i]=1;
          }
      if(stripped)
        {
          target=last_nt_named(buffer);
          if(target>=0)
            argmap[i]=target;
        }
    }

  /*gap map: strip a trailing "-g"*/
  for(i=0;i<numnts;i++)
    {
      len=strlen(nts[i]);
      /* names shorter than two characters cannot end in "-g"; testing the
         length first keeps the indices below from going negative */
      if(len>=2 &&
         nts[i][len-1]=='g' &&
         nts[i][len-2]=='-')
        {
          hasgap[i]=1;
          strcpy(buffer,nts[i]);
          buffer[len-2]='\0';
          target=last_nt_named(buffer);
          if(target>=0)
            gapmap[i]=target;
        }
    }

  /*special case: Ss-A-g maps to S-A (gap) and S (argument)*/
  for(i=0;i<numnts;i++)
    if(strcmp(nts[i],"Ss-A-g")==0)
      {
        target=last_nt_named("S-A");
        if(target>=0)
          gapmap[i]=target;
        target=last_nt_named("S");
        if(target>=0)
          argmap[i]=target;
      }
}
void read_lexicon(FILE *file)
{
char word[1000],tag[1000];
int fw;
int wn,tn;
while(fscanf(file,"%s %s %d",word,tag,&fw)!=EOF)
{
wn = add_word(word,&wordlex);
assert(wn>=0);
tn = find_word(tag,&nt_lex);
assert(tn>=0);
if(fw)
fwords[wn] = wn;
else
fwords[wn] = GUNKNOWN;
tagdict[wn][tn] = 1;
}
}
void read_grm(FILE *file)
{
char buffer[1000];
char b1[1000],b2[1000],b3[1000];
int n1,n2,n3;
while(fscanf(file,"%s",buffer)!=EOF)
{
if(strcmp(buffer,"L")==0)
{
fscanf(file,"%s %s %s",b1,b2,b3);
n1 = find_word(b1,&nt_lex);
assert(n1>=0);
n2 = find_word(b2,&nt_lex);
assert(n2>=0);
n3 = find_word(b3,&nt_lex);
assert(n3>=0);
tablep[n1][n2][n3] = 1;
}
else if(strcmp(buffer,"R")==0)
{
fscanf(file,"%s %s %s",b1,b2,b3);
n1 = find_word(b1,&nt_lex);
assert(n1>=0);
n2 = find_word(b2,&nt_lex);
assert(n2>=0);
n3 = find_word(b3,&nt_lex);
assert(n3>=0);
tablef[n1][n2][n3] = 1;
}
else if(strcmp(buffer,"U")==0)
{
fscanf(file,"%s %s",b1,b2);
n1 = find_word(b1,&nt_lex);
assert(n1>=0);
n2 = find_word(b2,&nt_lex);
assert(n2>=0);
unaries[n2][unary_nums[n2]] = n1;
unary_nums[n2]++;
}
else if(strcmp(buffer,"X")==0)
{
fscanf(file,"%s %s %d",b1,b2,&n3);
n1 = find_word(b1,&nt_lex);
assert(n1>=0);
n2 = find_word(b2,&nt_lex);
assert(n2>=0);
lsubcats[n1][n2][ lsubcats_counts[n1][n2] ] = n3;
lsubcats_counts[n1][n2]++;
}
else if(strcmp(buffer,"Y")==0)
{
fscanf(file,"%s %s %d",b1,b2,&n3);
n1 = find_word(b1,&nt_lex);
assert(n1>=0);
n2 = find_word(b2,&nt_lex);
assert(n2>=0);
rsubcats[n1][n2][ rsubcats_counts[n1][n2] ] = n3;
rsubcats_counts[n1][n2]++;
}
else assert(0);
}
}
/* Returns 1 when tag lies in the inclusive range PVB..PVBZ, and 0
   otherwise. */
int isverb(int tag)
{
  return (tag>=PVB && tag<=PVBZ) ? 1 : 0;
}