/* This code is MODIFIED from the original source distribution, originally by
M. Collins (1999). The original documentation is contained below.
The modifications to this file are by Min-Yen Kan, and are also distributed
under GNU GPL license, see below or the GNU GPL License included with the
distribution.
The modifications enable the parser to work as a daemon, see the distributed
README-daemonCollins.html for details.
*/
/* This code is the statistical natural language parser described in
M. Collins. 1999. Head-Driven
Statistical Models for Natural Language Parsing. PhD Dissertation,
University of Pennsylvania.
Copyright (C) 1999 Michael Collins
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <assert.h>
#include "prob.h"
/* Parent non-terminal code that selects the start (S1/S2) distributions in
add_unary_counts and get_unary_prob. */
#define START_NT 43
/* Every distribution below is described by three items that are passed
together to add_counts / get_prob:
  xPROBTYPE  numeric id of the distribution
  xOLEN      NOTE(review): appears to be the length in bytes of the outcome,
             i.e. of the fields the matching make_*_string function writes
             first, starting at byte 3 of the key string - confirm against
             add_counts/get_prob
  xBACKOFFS  NOTE(review): appears to be {number of back-off levels, context
             length in bytes at level 1, at level 2, ...}; the values line up
             with the field layouts of the make_*_string functions, but the
             consumer is not in this file - confirm
The *_NOTAG tables omit the last level of the corresponding full table; they
are used when adding counts, and add_tagword_counts adds counts for that last
level separately through add_counts_level. */
/*dependency parameters: D1 = generating nt,tag, D2 = generating word*/
#define D1PROBTYPE 0
#define D1OLEN 4
int D1BACKOFFS[] = {3,9,7,6};
#define D2PROBTYPE 1
#define D2OLEN 2
int D2BACKOFFS[] = {3,13,11,1};
int D2BACKOFFS_NOTAG[] = {2,13,11};
/*unary parameters */
#define UPROBTYPE 2
#define UOLEN 1
int UBACKOFFS[] = {3,4,2,1};
/*start parameters: S1 = generating nt,tag, S2 = generating word*/
#define S1PROBTYPE 3
#define S1OLEN 2
int S1BACKOFFS[] = {1,1};
#define S2PROBTYPE 4
#define S2OLEN 2
int S2BACKOFFS[] = {2,3,1};
int S2BACKOFFS_NOTAG[] = {1,3};
/*subcategorisation parameters */
#define SCPROBTYPE 5
#define SCOLEN 3
int SCBACKOFFS[] = {3,6,4,3};
/*prior parameters: P1 is used with make_prior1_string (ch first, then 0,th,wh),
P2 with make_prior2_string (wh,th first, then 0)*/
#define P1PROBTYPE 6
#define P1OLEN 1
int P1BACKOFFS[] = {3,4,2,1};
#define P2PROBTYPE 7
#define P2OLEN 3
int P2BACKOFFS[] = {1,1};
/*coordination/punc parameters: CP1 generates tag, CP2 generates the word
NOTE(review): make_pcc1_string writes 10 bytes after the tag and
make_pcc2_string writes 11 bytes after the word, yet the first levels below
are 9 and 10. If these values are context byte counts, the first level covers
only the low byte of wm - confirm whether that is intended before changing
anything, since trained models depend on these values. */
#define CP1PROBTYPE 8
#define CP1OLEN 1
int CP1BACKOFFS[] = {3,9,6,1};
#define CP2PROBTYPE 9
#define CP2OLEN 2
int CP2BACKOFFS[] = {3,10,7,1};
int CP2BACKOFFS_NOTAG[] = {2,10,7};
/*gap parameters */
#define GPROBTYPE 10
#define GOLEN 1
int GBACKOFFS[] = {3,5,3,2};
/* Forward declarations of the key-string builders defined later in this file.
Each one writes its fields into string starting at byte 3; bytes 0..2 are not
touched by the builders. */
/*make string for P(cm,t | context) */
void make_dep1_string(unsigned char *string,int wm,int tm,int cm,int wh,int th,int p,int ch,int dist,int subcat,int cc,int punc);
/*make string for P(wm | cm,t, context) */
void make_dep2_string(unsigned char *string,int wm,int tm,int cm,int wh,int th,int p,int ch,int dist,int subcat,int cc,int punc);
/*make string for P(ch | p,th,wh) */
void make_unary_string(unsigned char *string,int ch,int wh,int th,int p);
/*make string for P(ch,th | p==TOP) */
void make_s1_string(unsigned char *string,int ch,int wh,int th,int p);
/*make string for P(wh | ch,th,p==TOP) */
void make_s2_string(unsigned char *string,int ch,int wh,int th,int p);
/*make string for P(subcat | ch,th,p,dir) */
void make_subcat_string(unsigned char *string,int subcat,int ch,int wh,int th,int p,int dir);
/*make strings for the prior distributions:
prior1 writes ch, then 0, th, wh   (P(ch | wh,th))
prior2 writes wh, th, then 0       (P(wh,th)); its ch argument is not used*/
void make_prior1_string(unsigned char *string,int ch,int wh,int th);
void make_prior2_string(unsigned char *string,int ch,int wh,int th);
/*make strings for P(ccword,cctag | ...) or P(pword,ptag | ...)
type is 0 for coordination 1 for punctuation */
void make_pcc1_string(unsigned char *string,int t,int p,int ch,int th,int wh,int cm,int tm,int wm,int type);
void make_pcc2_string(unsigned char *string,int w,int t,int p,int ch,int th,int wh,int cm,int tm,int wm,int type);
/*make string for P(gap | wh,ch,th,p) */
void make_gap_string(unsigned char *string,int gap,int ch,int wh,int th,int p);
/* Add training counts for one modifier (dependency) event.
 *
 * In order:
 *   1. unless cm is STOPNT, prior counts are added for (cm, wm, tm) using the
 *      unmapped wm (add_prior_counts applies its own mappings);
 *   2. wm is mapped through fwords, p through argmap and ch through gapmap;
 *   3. counts are added for the dep1 string and for the dep2 string (the
 *      latter with the _NOTAG back-off table);
 *   4. when cc == 1 counts are added for the coordination tag/word (tcc, wcc),
 *      then when punc == 1 for the punctuation tag/word (tpunc, wpunc).
 */
void add_dependency_counts(int wm,int tm,int cm,int wh,int th,int p,int ch,int dist,int subcat,int cc,int wcc,int tcc,int punc,int wpunc,int tpunc,hash_table *hash)
{
    unsigned char key[1000];
    int present[2];
    int tag[2];
    int word[2];
    int type;

    if (cm != STOPNT) {
        add_prior_counts(cm, wm, tm, hash);
    }

    wm = fwords[wm];
    p = argmap[p];
    ch = gapmap[ch];

    make_dep1_string(key, wm, tm, cm, wh, th, p, ch, dist, subcat, cc, punc);
    add_counts(key, D1OLEN, D1BACKOFFS, D1PROBTYPE, hash);

    make_dep2_string(key, wm, tm, cm, wh, th, p, ch, dist, subcat, cc, punc);
    add_counts(key, D2OLEN, D2BACKOFFS_NOTAG, D2PROBTYPE, hash);

    /* type 0 = coordination, type 1 = punctuation; handled in that order */
    present[0] = cc;
    tag[0] = tcc;
    word[0] = wcc;
    present[1] = punc;
    tag[1] = tpunc;
    word[1] = wpunc;

    for (type = 0; type < 2; type++) {
        if (present[type] != 1) {
            continue;
        }
        make_pcc1_string(key, tag[type], p, ch, th, wh, cm, tm, wm, type);
        add_counts(key, CP1OLEN, CP1BACKOFFS, CP1PROBTYPE, hash);

        make_pcc2_string(key, word[type], tag[type], p, ch, th, wh, cm, tm, wm, type);
        add_counts(key, CP2OLEN, CP2BACKOFFS_NOTAG, CP2PROBTYPE, hash);
    }
}
/* Return the log of the probability of one modifier (dependency) event.
 *
 * The result is log(p1 * p2 * p3 * p4) where
 *   p1  comes from the dep1 string,
 *   p2  comes from the dep2 string, or is 1 when cm is STOPNT,
 *   p3  is the product of the pcc1 and pcc2 lookups for the coordination
 *       tag/word (tcc, wcc) when cc == 1, otherwise 1,
 *   p4  is the same for the punctuation tag/word (tpunc, wpunc) when
 *       punc == 1, otherwise 1.
 * Before any lookup wm is mapped through fwords, p through argmap and ch
 * through gapmap. The trailing 0,5 arguments are handed to get_prob
 * unchanged; their meaning is defined there.
 */
double get_dependency_prob(int wm,int tm,int cm,int wh,int th,int p,int ch,int dist,int subcat,int cc,int wcc,int tcc,int punc,int wpunc,int tpunc,hash_table *hash)
{
    unsigned char key[1000];
    double dep1_prob;
    double dep2_prob;
    double extra[2];
    int present[2];
    int tag[2];
    int word[2];
    int type;

    wm = fwords[wm];
    p = argmap[p];
    ch = gapmap[ch];

    make_dep1_string(key, wm, tm, cm, wh, th, p, ch, dist, subcat, cc, punc);
    dep1_prob = get_prob(key, D1OLEN, D1BACKOFFS, D1PROBTYPE, 0, 5, hash);

    /* no word is generated for a STOP modifier */
    dep2_prob = 1;
    if (cm != STOPNT) {
        make_dep2_string(key, wm, tm, cm, wh, th, p, ch, dist, subcat, cc, punc);
        dep2_prob = get_prob(key, D2OLEN, D2BACKOFFS, D2PROBTYPE, 0, 5, hash);
    }

    /* type 0 = coordination, type 1 = punctuation; looked up in that order */
    present[0] = cc;
    tag[0] = tcc;
    word[0] = wcc;
    present[1] = punc;
    tag[1] = tpunc;
    word[1] = wpunc;

    for (type = 0; type < 2; type++) {
        extra[type] = 1;
        if (present[type] != 1) {
            continue;
        }
        make_pcc1_string(key, tag[type], p, ch, th, wh, cm, tm, wm, type);
        extra[type] = get_prob(key, CP1OLEN, CP1BACKOFFS, CP1PROBTYPE, 0, 5, hash);

        make_pcc2_string(key, word[type], tag[type], p, ch, th, wh, cm, tm, wm, type);
        extra[type] *= get_prob(key, CP2OLEN, CP2BACKOFFS, CP2PROBTYPE, 0, 5, hash);
    }

    return log(dep1_prob*dep2_prob*extra[0]*extra[1]);
}
/* Add training counts for the two prior distributions of a constituent.
 *
 * ch is mapped through gapmap and wh through fwords; counts are then added
 * first for the prior1 string (P1 tables) and then for the prior2 string
 * (P2 tables).
 */
void add_prior_counts(int ch,int wh,int th,hash_table *hash)
{
    unsigned char key[1000];
    int mapped_ch = gapmap[ch];
    int mapped_wh = fwords[wh];

    make_prior1_string(key, mapped_ch, mapped_wh, th);
    add_counts(key, P1OLEN, P1BACKOFFS, P1PROBTYPE, hash);

    make_prior2_string(key, mapped_ch, mapped_wh, th);
    add_counts(key, P2OLEN, P2BACKOFFS, P2PROBTYPE, hash);
}
/* Return the log of the prior probability of a constituent.
 *
 * ch is mapped through gapmap and wh through fwords. The result is the log
 * of the product of the prior1 lookup (get_prob called with 0,5) and the
 * prior2 lookup (get_prob called with 1,0); the meaning of those trailing
 * arguments is defined by get_prob.
 */
double get_prior_prob(int ch,int wh,int th,hash_table *hash)
{
    unsigned char key[1000];
    double prior1;
    double prior2;
    int mapped_ch = gapmap[ch];
    int mapped_wh = fwords[wh];

    make_prior1_string(key, mapped_ch, mapped_wh, th);
    prior1 = get_prob(key, P1OLEN, P1BACKOFFS, P1PROBTYPE, 0, 5, hash);

    make_prior2_string(key, mapped_ch, mapped_wh, th);
    prior2 = get_prob(key, P2OLEN, P2BACKOFFS, P2PROBTYPE, 1, 0, hash);

    return log(prior1*prior2);
}
/* Add training counts for a head child ch generated under parent p.
 *
 * Prior counts for (ch, wh, th) are always added first. Then:
 *   - if p is not START_NT, counts are added for the unary string, with wh
 *     passed as given;
 *   - if p is START_NT, wh is mapped through fwords and counts are added for
 *     the s1 string and for the s2 string (the latter with the _NOTAG
 *     back-off table).
 */
void add_unary_counts(int ch,int wh,int th,int p,hash_table *hash)
{
    unsigned char key[1000];

    add_prior_counts(ch, wh, th, hash);

    if (p != START_NT) {
        make_unary_string(key, ch, wh, th, p);
        add_counts(key, UOLEN, UBACKOFFS, UPROBTYPE, hash);
        return;
    }

    wh = fwords[wh];

    make_s1_string(key, ch, wh, th, p);
    add_counts(key, S1OLEN, S1BACKOFFS, S1PROBTYPE, hash);

    make_s2_string(key, ch, wh, th, p);
    add_counts(key, S2OLEN, S2BACKOFFS_NOTAG, S2PROBTYPE, hash);
}
/* Return the log probability of head child ch under parent p.
 *
 *   - if p is not START_NT: log of the unary-string lookup, with wh passed
 *     as given;
 *   - if p is START_NT: wh is mapped through fwords and the result is the log
 *     of the product of the s1 and s2 lookups.
 */
double get_unary_prob(int ch,int wh,int th,int p,hash_table *hash)
{
    unsigned char key[1000];
    double s1_prob;
    double s2_prob;

    if (p != START_NT) {
        make_unary_string(key, ch, wh, th, p);
        return log(get_prob(key, UOLEN, UBACKOFFS, UPROBTYPE, 0, 5, hash));
    }

    wh = fwords[wh];

    make_s1_string(key, ch, wh, th, p);
    s1_prob = get_prob(key, S1OLEN, S1BACKOFFS, S1PROBTYPE, 0, 5, hash);

    make_s2_string(key, ch, wh, th, p);
    s2_prob = get_prob(key, S2OLEN, S2BACKOFFS, S2PROBTYPE, 0, 5, hash);

    return log(s1_prob*s2_prob);
}
/* Add training counts for a subcategorisation event.
 *
 * dir is 0 for the left side and 1 for the right side. Both p and ch are
 * mapped through argmap before the subcat string is built.
 */
void add_subcat_counts(int subcat,int ch,int wh,int th,int p,int dir,hash_table *hash)
{
    unsigned char key[1000];
    int mapped_p = argmap[p];
    int mapped_ch = argmap[ch];

    make_subcat_string(key, subcat, mapped_ch, wh, th, mapped_p, dir);
    add_counts(key, SCOLEN, SCBACKOFFS, SCPROBTYPE, hash);
}
/* Return the log probability of a subcategorisation event.
 *
 * Both p and ch are mapped through argmap before the subcat string is built.
 * get_prob is called with trailing arguments 5,0; their meaning is defined
 * by get_prob.
 */
double get_subcat_prob(int subcat,int ch,int wh,int th,int p,int dir,hash_table *hash)
{
    unsigned char key[1000];
    int mapped_p = argmap[p];
    int mapped_ch = argmap[ch];
    double prob;

    make_subcat_string(key, subcat, mapped_ch, wh, th, mapped_p, dir);
    prob = get_prob(key, SCOLEN, SCBACKOFFS, SCPROBTYPE, 5, 0, hash);

    return log(prob);
}
/* Add training counts for a gap event.
 *
 * Both p and ch are mapped through argmap before the gap string is built.
 */
void add_gap_counts(int gap,int ch,int wh,int th,int p,hash_table *hash)
{
    unsigned char key[1000];
    int mapped_p = argmap[p];
    int mapped_ch = argmap[ch];

    make_gap_string(key, gap, mapped_ch, wh, th, mapped_p);
    add_counts(key, GOLEN, GBACKOFFS, GPROBTYPE, hash);
}
/* Return the log probability of a gap event.
 *
 * Both p and ch are mapped through argmap before the gap string is built.
 * get_prob is called with trailing arguments 5,0; their meaning is defined
 * by get_prob.
 */
double get_gap_prob(int gap,int ch,int wh,int th,int p,hash_table *hash)
{
    unsigned char key[1000];
    int mapped_p = argmap[p];
    int mapped_ch = argmap[ch];
    double prob;

    make_gap_string(key, gap, mapped_ch, wh, th, mapped_p);
    prob = get_prob(key, GOLEN, GBACKOFFS, GPROBTYPE, 5, 0, hash);

    return log(prob);
}
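/*layout of the string for P(cm,t | context), as written by make_dep1_string
(defined after calc_newdist, which reduces the dist field)
position element
3 tm
4 cm
5 punc
6 cc
7 p
8 ch
9 dist (value produced by calc_newdist)
10..12 subcat
13 th
14..15 wh
*/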
/* Reduce the distance code d and store the result in *newd.
 *
 * d is peeled in three steps:
 *   - if at least 100, the result starts at 100 and 100 is removed from d;
 *     otherwise the result starts at 0;
 *   - if the remainder is at least 10, 10 is removed from it, and 10 is added
 *     to the result only when DISTAFLAG is non-zero;
 *   - if what is left is at least 1, 1 is added to the result only when
 *     DISTVFLAG is non-zero.
 */
void calc_newdist(int *newd,int d)
{
    int rest = d;

    *newd = 0;
    if (rest >= 100) {
        *newd = 100;
        rest -= 100;
    }

    if (rest >= 10) {
        if (DISTAFLAG) {
            *newd += 10;
        }
        rest -= 10;
    }

    if (rest >= 1 && DISTVFLAG) {
        *newd += 1;
    }
}
/* Fill the key string for P(cm,t | context).
 *
 * Bytes written (bytes 0..2 are left untouched):
 *    3       tm
 *    4       cm
 *    5       punc
 *    6       cc
 *    7       p
 *    8       ch
 *    9       dist, after reduction by calc_newdist
 *   10..12   subcat (3 bytes, via byte3_to_char)
 *   13       th
 *   14..15   wh (2 bytes, via byte2_to_char)
 *
 * wm is not used by this function. Single-byte fields keep only the low
 * 8 bits of their argument.
 */
void make_dep1_string(unsigned char *string,int wm,int tm,int cm,int wh,int th,int p,int ch,int dist,int subcat,int cc,int punc)
{
    int reduced_dist;

    calc_newdist(&reduced_dist, dist);

    /* outcome fields */
    string[3] = (unsigned char) tm;
    string[4] = (unsigned char) cm;
    string[5] = (unsigned char) punc;
    string[6] = (unsigned char) cc;

    /* context fields */
    string[7] = (unsigned char) p;
    string[8] = (unsigned char) ch;
    string[9] = (unsigned char) reduced_dist;
    byte3_to_char(string + 10, subcat);
    string[13] = (unsigned char) th;
    byte2_to_char(string + 14, wh);
}
/* Fill the key string for P(wm | cm,t, context).
 *
 * Bytes written (bytes 0..2 are left untouched):
 *    3..4    wm (2 bytes, via byte2_to_char)
 *    5       tm
 *    6       cm
 *    7       punc
 *    8       cc
 *    9       p
 *   10       ch
 *   11       dist, after reduction by calc_newdist
 *   12..14   subcat (3 bytes, via byte3_to_char)
 *   15       th
 *   16..17   wh (2 bytes, via byte2_to_char)
 *
 * Single-byte fields keep only the low 8 bits of their argument.
 */
void make_dep2_string(unsigned char *string,int wm,int tm,int cm,int wh,int th,int p,int ch,int dist,int subcat,int cc,int punc)
{
    int reduced_dist;

    calc_newdist(&reduced_dist, dist);

    /* the generated word comes first */
    byte2_to_char(string + 3, wm);

    /* everything it is conditioned on follows */
    string[5] = (unsigned char) tm;
    string[6] = (unsigned char) cm;
    string[7] = (unsigned char) punc;
    string[8] = (unsigned char) cc;
    string[9] = (unsigned char) p;
    string[10] = (unsigned char) ch;
    string[11] = (unsigned char) reduced_dist;
    byte3_to_char(string + 12, subcat);
    string[15] = (unsigned char) th;
    byte2_to_char(string + 16, wh);
}
/* Add word-given-tag counts for the three word-generating distributions.
 *
 * wm is mapped through fwords and a short string is built:
 *    3..4    wm (2 bytes, via byte2_to_char)
 *    5       tm
 * That same string is then passed to add_counts_level for D2 (level 3),
 * S2 (level 2) and CP2 (level 3), each time with the table {1,1}.
 * NOTE(review): these levels match the last entry of D2BACKOFFS, S2BACKOFFS
 * and CP2BACKOFFS, i.e. the level the *_NOTAG tables leave out - confirm
 * against add_counts_level.
 */
void add_tagword_counts(int wm,int tm,hash_table *hash)
{
    unsigned char key[1000];
    int tag_only[] = {1,1};
    int mapped_wm = fwords[wm];

    byte2_to_char(key + 3, mapped_wm);
    key[5] = (unsigned char) tm;

    add_counts_level(key, D2OLEN, tag_only, 3, D2PROBTYPE, hash);
    add_counts_level(key, S2OLEN, tag_only, 2, S2PROBTYPE, hash);
    add_counts_level(key, CP2OLEN, tag_only, 3, CP2PROBTYPE, hash);
}
/* Store n in two bytes, low byte first.
 *
 * string[0] receives the low 8 bits of n and string[1] receives n/256
 * truncated to 8 bits, so only values 0..65535 round-trip.
 */
void byte2_to_char(unsigned char *string,int n)
{
    const int high = n / 256;

    string[0] = (unsigned char) (n & 0xFF);
    string[1] = (unsigned char) high;
}
/* Store n in three bytes, low byte first.
 *
 * string[0] receives the low 8 bits of n, string[1] the low 8 bits of n/256
 * and string[2] receives n/65536 truncated to 8 bits, so only values
 * 0..16777215 round-trip.
 */
void byte3_to_char(unsigned char *string,int n)
{
    const int middle = (n / 256) & 0xFF;
    const int high = n / 65536;

    string[0] = (unsigned char) (n & 0xFF);
    string[1] = (unsigned char) middle;
    string[2] = (unsigned char) high;
}
/* Fill the key string for P(ch | p,th,wh).
 *
 * Bytes written (bytes 0..2 are left untouched):
 *   3       ch
 *   4       p
 *   5       th
 *   6..7    wh (2 bytes, via byte2_to_char)
 *
 * Single-byte fields keep only the low 8 bits of their argument.
 */
void make_unary_string(unsigned char *string,int ch,int wh,int th,int p)
{
    string[3] = (unsigned char) ch;

    string[4] = (unsigned char) p;
    string[5] = (unsigned char) th;
    byte2_to_char(string + 6, wh);
}
/* Fill the key string for P(ch,th | p==TOP).
 *
 * Bytes written (bytes 0..2 are left untouched):
 *   3       ch
 *   4       th
 *   5       p (the TOP parent)
 *
 * wh is not used by this function. Each field keeps only the low 8 bits of
 * its argument.
 */
void make_s1_string(unsigned char *string,int ch,int wh,int th,int p)
{
    unsigned char *field = string + 3;

    field[0] = (unsigned char) ch;
    field[1] = (unsigned char) th;
    field[2] = (unsigned char) p;
}
/* Fill the key string for P(wh | ch,th, p==TOP).
 *
 * Bytes written (bytes 0..2 are left untouched):
 *   3..4    wh (2 bytes, via byte2_to_char)
 *   5       th
 *   6       ch
 *   7       p (the TOP parent)
 *
 * Single-byte fields keep only the low 8 bits of their argument.
 */
void make_s2_string(unsigned char *string,int ch,int wh,int th,int p)
{
    byte2_to_char(string + 3, wh);

    string[5] = (unsigned char) th;
    string[6] = (unsigned char) ch;
    string[7] = (unsigned char) p;
}
/* Fill the key string for P(subcat | wh,ch,th, p,dir).
 *
 * Bytes written (bytes 0..2 are left untouched):
 *    3..5    subcat (3 bytes, via byte3_to_char)
 *    6       ch
 *    7       p
 *    8       dir
 *    9       th
 *   10..11   wh (2 bytes, via byte2_to_char)
 *
 * Single-byte fields keep only the low 8 bits of their argument.
 */
void make_subcat_string(unsigned char *string,int subcat,int ch,int wh,int th,int p,int dir)
{
    byte3_to_char(string + 3, subcat);

    string[6] = (unsigned char) ch;
    string[7] = (unsigned char) p;
    string[8] = (unsigned char) dir;
    string[9] = (unsigned char) th;
    byte2_to_char(string + 10, wh);
}
/* Fill the key string for P(ch | wh,th).
 *
 * Bytes written (bytes 0..2 are left untouched):
 *   3       ch
 *   4       constant 0
 *   5       th
 *   6..7    wh (2 bytes, via byte2_to_char)
 *
 * Single-byte fields keep only the low 8 bits of their argument.
 */
void make_prior1_string(unsigned char *string,int ch,int wh,int th)
{
    string[3] = (unsigned char) ch;

    string[4] = 0;
    string[5] = (unsigned char) th;
    byte2_to_char(string + 6, wh);
}
/* Fill the key string for P(wh, th | p==anything).
 *
 * Bytes written (bytes 0..2 are left untouched):
 *   3..4    wh (2 bytes, via byte2_to_char)
 *   5       th
 *   6       constant 0
 *
 * ch is not used by this function. th keeps only its low 8 bits.
 */
void make_prior2_string(unsigned char *string,int ch,int wh,int th)
{
    byte2_to_char(string + 3, wh);
    string[5] = (unsigned char) th;

    string[6] = 0;
}
/* Fill the key string used to generate a coordination or punctuation tag.
 *
 * Bytes written (bytes 0..2 are left untouched):
 *    3       t
 *    4       type
 *    5       p
 *    6       ch
 *    7       cm
 *    8       th
 *    9       tm
 *   10..11   wh (2 bytes, via byte2_to_char)
 *   12..13   wm (2 bytes, via byte2_to_char)
 *
 * Callers in this file pass type 0 for coordination and 1 for punctuation.
 * Single-byte fields keep only the low 8 bits of their argument.
 */
void make_pcc1_string(unsigned char *string,int t,int p,int ch,int th,int wh,int cm,int tm,int wm,int type)
{
    string[3] = (unsigned char) t;

    string[4] = (unsigned char) type;
    string[5] = (unsigned char) p;
    string[6] = (unsigned char) ch;
    string[7] = (unsigned char) cm;
    string[8] = (unsigned char) th;
    string[9] = (unsigned char) tm;
    byte2_to_char(string + 10, wh);
    byte2_to_char(string + 12, wm);
}
/* Fill the key string used to generate a coordination or punctuation word.
 *
 * Bytes written (bytes 0..2 are left untouched):
 *    3..4    w (2 bytes, via byte2_to_char)
 *    5       t
 *    6       type
 *    7       p
 *    8       ch
 *    9       cm
 *   10       th
 *   11       tm
 *   12..13   wh (2 bytes, via byte2_to_char)
 *   14..15   wm (2 bytes, via byte2_to_char)
 *
 * Callers in this file pass type 0 for coordination and 1 for punctuation.
 * Single-byte fields keep only the low 8 bits of their argument.
 */
void make_pcc2_string(unsigned char *string,int w,int t,int p,int ch,int th,int wh,int cm,int tm,int wm,int type)
{
    byte2_to_char(string + 3, w);

    string[5] = (unsigned char) t;
    string[6] = (unsigned char) type;
    string[7] = (unsigned char) p;
    string[8] = (unsigned char) ch;
    string[9] = (unsigned char) cm;
    string[10] = (unsigned char) th;
    string[11] = (unsigned char) tm;
    byte2_to_char(string + 12, wh);
    byte2_to_char(string + 14, wm);
}
/* Fill the key string for P(gap | wh,ch,th, p).
 *
 * Bytes written (bytes 0..2 are left untouched):
 *   3       gap
 *   4       ch
 *   5       p
 *   6       th
 *   7..8    wh (2 bytes, via byte2_to_char)
 *
 * Single-byte fields keep only the low 8 bits of their argument.
 */
void make_gap_string(unsigned char *string,int gap,int ch,int wh,int th,int p)
{
    string[3] = (unsigned char) gap;

    string[4] = (unsigned char) ch;
    string[5] = (unsigned char) p;
    string[6] = (unsigned char) th;
    byte2_to_char(string + 7, wh);
}