// Logo Search packages:
// Sourcecode: lttoolbox version File versions  Download package
//
// fst_processor.h
/*
 * Copyright (C) 2005 Universitat d'Alacant / Universidad de Alicante
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
 * 02111-1307, USA.
 */

#ifndef _FSTPROCESSOR_
#define _FSTPROCESSOR_

#include <lttoolbox/alphabet.h>
#include <lttoolbox/buffer.h>
#include <lttoolbox/ltstr.h>
#include <lttoolbox/my_stdio.h>
#include <lttoolbox/state.h>
#include <lttoolbox/trans_exe.h>

#include <cstdio>
#include <cwchar>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <utility>
#include <vector>

// NOTE(review): a using-directive at header scope injects every std name into
// each translation unit that includes this file. It is kept because the
// declarations in this header spell std types unqualified (map, set, wstring,
// queue, ...); removing it would require qualifying those names first.
using namespace std;

/**
 * Kind of output produced by the generator module.
 */
enum GenerationMode
{
  /** clear all */
  gm_clean = 0,

  /** display unknown words, clear transfer and generation tags */
  gm_unknown = 1,

  /** display all */
  gm_all = 2
};

/**
 * Class that implements the FST-based modules of the system
 */
00051 class FSTProcessor
{
private:
  /**
   * Transducers in FSTP
   */
00057   map<wstring, TransExe, Ltstr> transducers;

  /**
   * Current state of lexical analysis
   */
00062   State current_state;

  /**
   * Initial state of every token
   */
00067   State initial_state;

  /**
   * Set of final states of inconditional sections in the dictionaries
   */
00072   set<Node *> inconditional;

  /**
   * Set of final states of standard sections in the dictionaries
   */
00077   set<Node *> standard;

  /**
   * Set of final states of postblank sections in the dictionaries
   */
00082   set<Node *> postblank;

  /**
   * Set of final states of preblank sections in the dictionaries
   */
00087   set<Node *> preblank;

  /**
   * Merge of 'inconditional', 'standard', 'postblank' and 'preblank' sets
   */
00092   set<Node *> all_finals;

  /**
   * Queue of blanks, used in reading methods
   */
00097   queue<wstring> blankqueue;

  /**
   * Set of characters being considered alphabetics
   */
00102   set<wchar_t> alphabetic_chars;

  /**
   * Set of characters to escape with a backslash
   */
00107   set<wchar_t> escaped_chars;

  /**
   * Alphabet
   */
00112   Alphabet alphabet;

  /**
   * Input buffer
   */
00117   Buffer<int> input_buffer;

  /**
   * Begin of the transducer
   */
00122   Node root;

  /**
   * true if the position of input stream is out of a word
   */
00127   bool outOfWord;


  /**
   * if true, makes always difference between uppercase and lowercase
   * characters
   */
00134   bool caseSensitive;

  /**
   * if true, flush the output when the null character is found
   */
00139   bool nullFlush;

  /**
   * Prints an error of input stream and exits
   */
  void streamError();

  /**
   * Reads a character that is defined in the set of escaped_chars
   * @param input the stream to read from
   * @return code of the character
   */
  wchar_t readEscaped(FILE *input);

  /**
   * Reads a block from the stream input, enclosed by delim1 and delim2
   * @param input the stream being read
   * @param delim1 the delimiter of the beginning of the sequence
   * @param delim1 the delimiter of the end of the sequence
   */
  wstring readFullBlock(FILE *input, wchar_t const delim1, wchar_t const delim2);

  /**
   * Returns true if the character code is identified as alphabetic
   * @param c the code provided by the user
   * @return true if it's alphabetic
   */
  bool isAlphabetic(wchar_t const c) const;

  /**
   * Tests if a character is in the set of escaped_chars
   * @param c the character code provided by the user
   * @return true if it is in the set
   */
  bool isEscaped(wchar_t const c) const;

  /**
   * Read text from stream (analysis version, also used in postgeneration)
   * @param input the stream to read
   * @return the next symbol in the stream
   */
  int readAnalysis(FILE *input);

  /**
   * Read text from stream (postgeneration version)
   * @param input the stream to read
   * @return the next symbol in the stream
   */
  int readPostgeneration(FILE *input);

  /**
   * Read text from stream (generation version)
   * @param input the stream to read
   * @return the next symbol in the stream
   */
  int readGeneration(FILE *input, FILE *output);

  /**
   * Read text from stream (SAO version)
   * @param input the stream to read
   * @return the next symbol in the stream
   */
  int readSAO(FILE *input);

  /**
   * Flush all the blanks remaining in the current process
   * @param output stream to write blanks
   */
  void flushBlanks(FILE *output);

  /**
   * Calculate the initial state of parsing
   */
  void calcInitial();

  /**
   * Calculate all the results of the word being parsed
   */
  void classifyFinals();

  /**
   * Write a string to an output stream,
   * @param str the string to write, escaping characters
   * @param output the stream to write in
   */
  void writeEscaped(wstring const &str, FILE *output);

  /**
   * Checks if an string ends with a particular suffix
   * @param str the string to test
   * @param the searched suffix
   * @returns true if 'str' has the suffix 'suffix'
   */
  static bool endsWith(wstring const &str, wstring const &suffix);

  /**
   * Prints a word
   * @param sf surface form of the word
   * @param lf lexical form of the word
   * @param output stream where the word is written
   */
  void printWord(wstring const &sf, wstring const &lf, FILE *output);

  /**
   * Prints a word, SAO version
   * @param lf lexical form
   * @param output stream where the word is written
   */
  void printSAOWord(wstring const &lf, FILE *output);

  /**
   * Prints an unknown word
   * @param sf surface form of the word
   * @param output stream where the word is written
   */
  void printUnknownWord(wstring const &sf, FILE *output);

  vector<wstring> numbers;
  int readTMAnalysis(FILE *input);

  unsigned int lastBlank(wstring const &str);
  void printSpace(wchar_t const val, FILE *output);
  void skipUntil(FILE *input, FILE *output, wint_t const character);
  static wstring removeTags(wstring const &str);
  size_t firstNotAlpha(wstring const &sf);

  void analysis_wrapper_null_flush(FILE *input, FILE *output);
  void generation_wrapper_null_flush(FILE *input, FILE *output,
                                     GenerationMode mode);
  void postgeneration_wrapper_null_flush(FILE *input, FILE *output);
  void transliteration_wrapper_null_flush(FILE *input, FILE *output);


  bool isLastBlankTM;
public:
  FSTProcessor();
  ~FSTProcessor();

  void initAnalysis();
  void initTMAnalysis();
  void initSAO(){initAnalysis();};
  void initGeneration();
  void initPostgeneration();
  void initBiltrans();

  void analysis(FILE *input = stdin, FILE *output = stdout);
  void tm_analysis(FILE *input = stdin, FILE *output = stdout);
  void generation(FILE *input = stdin, FILE *output = stdout, GenerationMode mode = gm_unknown);
  void postgeneration(FILE *input = stdin, FILE *output = stdout);
  void transliteration(FILE *input = stdin, FILE *output = stdout);
  wstring biltrans(wstring const &input_word, bool with_delim = true);
  pair<wstring, int> biltransWithQueue(wstring const &input_word, bool with_delim = true);
  wstring biltransWithoutQueue(wstring const &input_word, bool with_delim = true);
  void SAO(FILE *input = stdin, FILE *output = stdout);

  void load(FILE *input);

  bool valid() const;

  void setCaseSensitiveMode(bool const value);
  void setNullFlush(bool const value);
  bool getNullFlush();
};

#endif

// Generated by  Doxygen 1.6.0   Back to index