sg.edu.nus.comp.nlp.ims.corpus
Class ACorpus

java.lang.Object
  extended by sg.edu.nus.comp.nlp.ims.corpus.ACorpus
All Implemented Interfaces:
ICorpus
Direct Known Subclasses:
CAllWordsPlainCorpus, CLexicalCorpus

public abstract class ACorpus
extends java.lang.Object
implements ICorpus

Abstract base implementation of the ICorpus interface.

Author:
zhongzhi

Field Summary
protected static int g_LIDX
           
protected static int g_PIDX
           
protected static int g_TIDX
           
protected  java.util.ArrayList<java.lang.Integer> m_Boundaries
           
protected  java.lang.String m_DefaultDelimiter
           
protected  java.lang.String m_Delimiter
           
protected  java.util.ArrayList<java.lang.String> m_DocIDs
           
protected  java.util.ArrayList<java.lang.String> m_IDs
           
protected  java.util.ArrayList<java.lang.Integer> m_Indice
           
protected  java.util.ArrayList<java.lang.String> m_InstanceLemmas
           
protected  java.util.ArrayList<java.lang.String> m_InstancePOSs
           
protected  java.util.ArrayList<java.lang.String> m_InstanceTokens
           
protected  boolean m_Lemmatized
           
protected  ILemmatizer m_Lemmatizer
           
protected  java.util.ArrayList<java.lang.Integer> m_Lengths
           
protected  java.util.ArrayList<java.lang.String> m_LexeltIDs
           
protected  boolean m_POSTagged
           
protected  IPOSTagger m_POSTagger
           
protected  boolean m_Ready
           
protected  java.util.Hashtable<java.lang.String,java.lang.Integer> m_SatID2Index
           
protected  java.util.ArrayList<java.lang.String[]> m_SatIDs
           
protected  java.util.ArrayList<java.lang.Integer> m_SatIndice
           
protected  java.util.ArrayList<java.lang.Integer> m_SatSentenceIDs
           
protected  java.util.ArrayList<java.lang.Integer> m_SentenceIDs
           
protected  java.util.ArrayList<ISentence> m_Sentences
           
protected  ISentenceSplitter m_SentenceSplitter
           
protected  boolean m_Split
           
protected  java.util.ArrayList<java.lang.String[]> m_Tags
           
protected  boolean m_Tokenized
           
protected  ITokenizer m_Tokenizer
           
 
Constructor Summary
ACorpus()
          default constructor
ACorpus(IPOSTagger p_POSTagger, ISentenceSplitter p_Splitter, ITokenizer p_Tokenizer, ILemmatizer p_Lemmatizer)
          constructor with some components
 
Method Summary
 void clear()
          clear the corpus
protected  void genInfo()
          collect some information
 int getIndexInSentence(int p_Index)
          get the index of an instance in sentence
 int getLength(int p_Index)
          get number of words of instance p_Index
 int getLowerBoundary(int p_Sentence)
          get lower boundary
 ISentence getSentence(int p_SentenceID)
          get the sentence
 int getSentenceID(int p_Index)
          get the id of sentence which contains the instance
 java.lang.String[] getTag(int p_Index)
          get the class of an instance
 int getUpperBoundary(int p_Sentence)
          get upper boundary
 java.lang.String getValue(int p_Index, java.lang.String p_Key)
          get the value of the given key for the instance at the given index
protected  boolean isReady()
          whether the corpus is ready
protected  boolean isValidInstance(int p_Index)
          check whether the instance is valid
protected  boolean isValidSentence(int p_Index)
          check whether the sentence index is valid
protected  void lemmatize()
          lemmatizing
abstract  boolean load(java.io.Reader p_Reader)
          load data into corpus
 int numOfSentences()
          get the number of sentences
protected  void posTag()
          pos tagging
 void setDelimiter(java.lang.String p_Delimiter)
          set delimiter
 void setLemmatized(boolean p_Lemmatized)
          whether the lemma info is provided
 void setPOSTagged(boolean p_POSTagged)
          whether the pos info is provided
 void setSplit(boolean split)
          whether the input is already split
 void setTokenized(boolean tokenized)
          whether sentences are already tokenized
 int size()
          get the number of instances
protected  void tokenize(java.util.ArrayList<java.util.ArrayList<java.lang.String>> p_Texts)
          tokenize the texts
protected abstract  void tokenizeSentence(java.lang.String p_Sentence)
          tokenize a sentence
 java.lang.String toString()
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
 

Field Detail

g_TIDX

protected static final int g_TIDX

g_LIDX

protected static final int g_LIDX

g_PIDX

protected static final int g_PIDX

m_Ready

protected boolean m_Ready

m_Sentences

protected java.util.ArrayList<ISentence> m_Sentences

m_IDs

protected java.util.ArrayList<java.lang.String> m_IDs

m_DocIDs

protected java.util.ArrayList<java.lang.String> m_DocIDs

m_InstanceTokens

protected java.util.ArrayList<java.lang.String> m_InstanceTokens

m_InstanceLemmas

protected java.util.ArrayList<java.lang.String> m_InstanceLemmas

m_InstancePOSs

protected java.util.ArrayList<java.lang.String> m_InstancePOSs

m_LexeltIDs

protected java.util.ArrayList<java.lang.String> m_LexeltIDs

m_SentenceIDs

protected java.util.ArrayList<java.lang.Integer> m_SentenceIDs

m_Indice

protected java.util.ArrayList<java.lang.Integer> m_Indice

m_Lengths

protected java.util.ArrayList<java.lang.Integer> m_Lengths

m_Tags

protected java.util.ArrayList<java.lang.String[]> m_Tags

m_SatIDs

protected java.util.ArrayList<java.lang.String[]> m_SatIDs

m_SatID2Index

protected java.util.Hashtable<java.lang.String,java.lang.Integer> m_SatID2Index

m_SatSentenceIDs

protected java.util.ArrayList<java.lang.Integer> m_SatSentenceIDs

m_SatIndice

protected java.util.ArrayList<java.lang.Integer> m_SatIndice

m_Boundaries

protected java.util.ArrayList<java.lang.Integer> m_Boundaries

m_DefaultDelimiter

protected java.lang.String m_DefaultDelimiter

m_Delimiter

protected java.lang.String m_Delimiter

m_POSTagger

protected IPOSTagger m_POSTagger

m_POSTagged

protected boolean m_POSTagged

m_SentenceSplitter

protected ISentenceSplitter m_SentenceSplitter

m_Split

protected boolean m_Split

m_Tokenizer

protected ITokenizer m_Tokenizer

m_Tokenized

protected boolean m_Tokenized

m_Lemmatizer

protected ILemmatizer m_Lemmatizer

m_Lemmatized

protected boolean m_Lemmatized
Constructor Detail

ACorpus

public ACorpus()
default constructor


ACorpus

public ACorpus(IPOSTagger p_POSTagger,
               ISentenceSplitter p_Splitter,
               ITokenizer p_Tokenizer,
               ILemmatizer p_Lemmatizer)
constructor with some components

Parameters:
p_POSTagger - pos tagger
p_Splitter - sentence splitter
p_Tokenizer - tokenizer
p_Lemmatizer - lemmatizer
Method Detail

setSplit

public void setSplit(boolean split)
whether the input is already split

Parameters:
split - whether split

setTokenized

public void setTokenized(boolean tokenized)
whether sentences are already tokenized

Parameters:
tokenized - whether tokenized

setLemmatized

public void setLemmatized(boolean p_Lemmatized)
whether the lemma info is provided

Parameters:
p_Lemmatized - whether lemmatized

setPOSTagged

public void setPOSTagged(boolean p_POSTagged)
whether the pos info is provided

Parameters:
p_POSTagged - whether pos tagged

setDelimiter

public void setDelimiter(java.lang.String p_Delimiter)
set delimiter

Parameters:
p_Delimiter - delimiter

load

public abstract boolean load(java.io.Reader p_Reader)
                      throws java.lang.Exception
Description copied from interface: ICorpus
load data into corpus

Specified by:
load in interface ICorpus
Parameters:
p_Reader - reader of the input stream
Returns:
ready or not
Throws:
java.lang.Exception - exception while loading file

getSentence

public ISentence getSentence(int p_SentenceID)
Description copied from interface: ICorpus
get the sentence

Specified by:
getSentence in interface ICorpus
Parameters:
p_SentenceID - sentence number
Returns:
sentence

getIndexInSentence

public int getIndexInSentence(int p_Index)
Description copied from interface: ICorpus
get the index of an instance in sentence

Specified by:
getIndexInSentence in interface ICorpus
Parameters:
p_Index - instance index
Returns:
index in sentence

getLength

public int getLength(int p_Index)
Description copied from interface: ICorpus
get number of words of instance p_Index

Specified by:
getLength in interface ICorpus
Parameters:
p_Index - instance index
Returns:
number of words

size

public int size()
Description copied from interface: ICorpus
get the number of instances

Specified by:
size in interface ICorpus
Returns:
size

numOfSentences

public int numOfSentences()
Description copied from interface: ICorpus
get the number of sentences

Specified by:
numOfSentences in interface ICorpus
Returns:
number of sentences

getSentenceID

public int getSentenceID(int p_Index)
Description copied from interface: ICorpus
get the id of sentence which contains the instance

Specified by:
getSentenceID in interface ICorpus
Parameters:
p_Index - instance index
Returns:
sentence number

getTag

public java.lang.String[] getTag(int p_Index)
Description copied from interface: ICorpus
get the class of an instance

Specified by:
getTag in interface ICorpus
Parameters:
p_Index - instance index
Returns:
instance tags

clear

public void clear()
Description copied from interface: ICorpus
clear the corpus

Specified by:
clear in interface ICorpus

getLowerBoundary

public int getLowerBoundary(int p_Sentence)
Description copied from interface: ICorpus
get lower boundary

Specified by:
getLowerBoundary in interface ICorpus
Parameters:
p_Sentence - sentence number
Returns:
lower boundary

getUpperBoundary

public int getUpperBoundary(int p_Sentence)
Description copied from interface: ICorpus
get upper boundary

Specified by:
getUpperBoundary in interface ICorpus
Parameters:
p_Sentence - sentence number
Returns:
upper boundary

getValue

public java.lang.String getValue(int p_Index,
                                 java.lang.String p_Key)
Description copied from interface: ICorpus
get the value of the given key for the instance at the given index

Specified by:
getValue in interface ICorpus
Parameters:
p_Index - instance index
p_Key - value key
Returns:
value

toString

public java.lang.String toString()
Overrides:
toString in class java.lang.Object

tokenize

protected void tokenize(java.util.ArrayList<java.util.ArrayList<java.lang.String>> p_Texts)
tokenize the texts

Parameters:
p_Texts - texts
Throws:
java.lang.Exception

tokenizeSentence

protected abstract void tokenizeSentence(java.lang.String p_Sentence)
tokenize a sentence

Parameters:
p_Sentence - input sentence

posTag

protected void posTag()
pos tagging


lemmatize

protected void lemmatize()
lemmatizing


genInfo

protected void genInfo()
collect some information


isReady

protected boolean isReady()
whether the corpus is ready

Returns:
ready or not

isValidInstance

protected boolean isValidInstance(int p_Index)
check whether the instance is valid

Parameters:
p_Index - instance index
Returns:
valid or not

isValidSentence

protected boolean isValidSentence(int p_Index)
check whether the sentence index is valid

Parameters:
p_Index - sentence index
Returns:
valid or not