|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object
sg.edu.nus.comp.nlp.ims.corpus.ACorpus
public abstract class ACorpus
abstract corpus.
Field Summary | |
---|---|
protected static int |
g_LIDX
|
protected static int |
g_PIDX
|
protected static int |
g_TIDX
|
protected java.util.ArrayList<java.lang.Integer> |
m_Boundaries
|
protected java.lang.String |
m_DefaultDelimiter
|
protected java.lang.String |
m_Delimiter
|
protected java.util.ArrayList<java.lang.String> |
m_DocIDs
|
protected java.util.ArrayList<java.lang.String> |
m_IDs
|
protected java.util.ArrayList<java.lang.Integer> |
m_Indice
|
protected java.util.ArrayList<java.lang.String> |
m_InstanceLemmas
|
protected java.util.ArrayList<java.lang.String> |
m_InstancePOSs
|
protected java.util.ArrayList<java.lang.String> |
m_InstanceTokens
|
protected boolean |
m_Lemmatized
|
protected ILemmatizer |
m_Lemmatizer
|
protected java.util.ArrayList<java.lang.Integer> |
m_Lengths
|
protected java.util.ArrayList<java.lang.String> |
m_LexeltIDs
|
protected boolean |
m_POSTagged
|
protected IPOSTagger |
m_POSTagger
|
protected boolean |
m_Ready
|
protected java.util.Hashtable<java.lang.String,java.lang.Integer> |
m_SatID2Index
|
protected java.util.ArrayList<java.lang.String[]> |
m_SatIDs
|
protected java.util.ArrayList<java.lang.Integer> |
m_SatIndice
|
protected java.util.ArrayList<java.lang.Integer> |
m_SatSentenceIDs
|
protected java.util.ArrayList<java.lang.Integer> |
m_SentenceIDs
|
protected java.util.ArrayList<ISentence> |
m_Sentences
|
protected ISentenceSplitter |
m_SentenceSplitter
|
protected boolean |
m_Split
|
protected java.util.ArrayList<java.lang.String[]> |
m_Tags
|
protected boolean |
m_Tokenized
|
protected ITokenizer |
m_Tokenizer
|
Constructor Summary | |
---|---|
ACorpus()
default constructor |
|
ACorpus(IPOSTagger p_POSTagger,
ISentenceSplitter p_Splitter,
ITokenizer p_Tokenizer,
ILemmatizer p_Lemmatizer)
constructor with some components |
Method Summary | |
---|---|
void |
clear()
clear the corpus |
protected void |
genInfo()
collect some information |
int |
getIndexInSentence(int p_Index)
get the index of an instance in sentence |
int |
getLength(int p_Index)
get number of words of instance p_Index |
int |
getLowerBoundary(int p_Sentence)
get lower boundary |
ISentence |
getSentence(int p_SentenceID)
get the sentence |
int |
getSentenceID(int p_Index)
get the id of sentence which contains the instance |
java.lang.String[] |
getTag(int p_Index)
get the class of an instance |
int |
getUpperBoundary(int p_Sentence)
get upper boundary |
java.lang.String |
getValue(int p_Index,
java.lang.String p_Key)
get the value of the given key for the instance at the given index |
protected boolean |
isReady()
whether the corpus is ready |
protected boolean |
isValidInstance(int p_Index)
check whether the instance is valid |
protected boolean |
isValidSentence(int p_Index)
check whether the sentence index is valid |
protected void |
lemmatize()
lemmatizing |
abstract boolean |
load(java.io.Reader p_Reader)
load data into corpus |
int |
numOfSentences()
get the number of sentences |
protected void |
posTag()
pos tagging |
void |
setDelimiter(java.lang.String p_Delimiter)
set delimiter |
void |
setLemmatized(boolean p_Lemmatized)
whether the lemma info is provided |
void |
setPOSTagged(boolean p_POSTagged)
whether the pos info is provided |
void |
setSplit(boolean split)
whether the input is already split |
void |
setTokenized(boolean tokenized)
whether sentences are already tokenized |
int |
size()
get the number of instances |
protected void |
tokenize(java.util.ArrayList<java.util.ArrayList<java.lang.String>> p_Texts)
tokenize the texts |
protected abstract void |
tokenizeSentence(java.lang.String p_Sentence)
tokenize a sentence |
java.lang.String |
toString()
|
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait |
Field Detail |
---|
protected static final int g_TIDX
protected static final int g_LIDX
protected static final int g_PIDX
protected boolean m_Ready
protected java.util.ArrayList<ISentence> m_Sentences
protected java.util.ArrayList<java.lang.String> m_IDs
protected java.util.ArrayList<java.lang.String> m_DocIDs
protected java.util.ArrayList<java.lang.String> m_InstanceTokens
protected java.util.ArrayList<java.lang.String> m_InstanceLemmas
protected java.util.ArrayList<java.lang.String> m_InstancePOSs
protected java.util.ArrayList<java.lang.String> m_LexeltIDs
protected java.util.ArrayList<java.lang.Integer> m_SentenceIDs
protected java.util.ArrayList<java.lang.Integer> m_Indice
protected java.util.ArrayList<java.lang.Integer> m_Lengths
protected java.util.ArrayList<java.lang.String[]> m_Tags
protected java.util.ArrayList<java.lang.String[]> m_SatIDs
protected java.util.Hashtable<java.lang.String,java.lang.Integer> m_SatID2Index
protected java.util.ArrayList<java.lang.Integer> m_SatSentenceIDs
protected java.util.ArrayList<java.lang.Integer> m_SatIndice
protected java.util.ArrayList<java.lang.Integer> m_Boundaries
protected java.lang.String m_DefaultDelimiter
protected java.lang.String m_Delimiter
protected IPOSTagger m_POSTagger
protected boolean m_POSTagged
protected ISentenceSplitter m_SentenceSplitter
protected boolean m_Split
protected ITokenizer m_Tokenizer
protected boolean m_Tokenized
protected ILemmatizer m_Lemmatizer
protected boolean m_Lemmatized
Constructor Detail |
---|
public ACorpus()
public ACorpus(IPOSTagger p_POSTagger, ISentenceSplitter p_Splitter, ITokenizer p_Tokenizer, ILemmatizer p_Lemmatizer)
p_POSTagger
- pos tagger
p_Splitter
- sentence splitter
p_Tokenizer
- tokenizer
p_Lemmatizer
- lemmatizer
Method Detail |
---|
public void setSplit(boolean split)
split
- whether split
public void setTokenized(boolean tokenized)
tokenized
- whether tokenized
public void setLemmatized(boolean p_Lemmatized)
p_Lemmatized
- whether lemmatized
public void setPOSTagged(boolean p_POSTagged)
p_POSTagged
- whether pos tagged
public void setDelimiter(java.lang.String p_Delimiter)
p_Delimiter
- delimiter
public abstract boolean load(java.io.Reader p_Reader) throws java.lang.Exception
ICorpus
load
in interface ICorpus
p_Reader
- reader of the input stream
java.lang.Exception
- exception while loading file
public ISentence getSentence(int p_SentenceID)
ICorpus
getSentence
in interface ICorpus
p_SentenceID
- sentence number
public int getIndexInSentence(int p_Index)
ICorpus
getIndexInSentence
in interface ICorpus
p_Index
- instance index
public int getLength(int p_Index)
ICorpus
getLength
in interface ICorpus
p_Index
- instance index
public int size()
ICorpus
size
in interface ICorpus
public int numOfSentences()
ICorpus
numOfSentences
in interface ICorpus
public int getSentenceID(int p_Index)
ICorpus
getSentenceID
in interface ICorpus
p_Index
- instance index
public java.lang.String[] getTag(int p_Index)
ICorpus
getTag
in interface ICorpus
p_Index
- instance index
public void clear()
ICorpus
clear
in interface ICorpus
public int getLowerBoundary(int p_Sentence)
ICorpus
getLowerBoundary
in interface ICorpus
p_Sentence
- sentence number
public int getUpperBoundary(int p_Sentence)
ICorpus
getUpperBoundary
in interface ICorpus
p_Sentence
- sentence number
public java.lang.String getValue(int p_Index, java.lang.String p_Key)
ICorpus
getValue
in interface ICorpus
p_Index
- instance index
p_Key
- value key
public java.lang.String toString()
toString
in class java.lang.Object
protected void tokenize(java.util.ArrayList<java.util.ArrayList<java.lang.String>> p_Texts)
p_Texts
- texts
java.lang.Exception
protected abstract void tokenizeSentence(java.lang.String p_Sentence)
p_Sentence
- input sentence
protected void posTag()
protected void lemmatize()
protected void genInfo()
protected boolean isReady()
protected boolean isValidInstance(int p_Index)
p_Index
- instance index
protected boolean isValidSentence(int p_Index)
p_Index
- sentence index
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |