// $Id: WordByWordInfilter.java 4699 2008-10-16 14:19:53Z nigelw $
// Copyright (c) 2006 DeltaXML Ltd. All rights reserved

package com.deltaxml.pipe.filters;

import java.util.ArrayList;
import java.util.EmptyStackException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import java.util.StringTokenizer;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import com.deltaxml.pipe.XMLFilterImpl2;

import com.deltaxml.api.DeltaXMLError;

/**
 * <p>Wraps words in a <code>&lt;deltaxml:word&gt;</code> element, punctuation in a 
 * <code>&lt;deltaxml:punctuation&gt;</code> element and whitespace in a
 * <code>&lt;deltaxml:space&gt;</code> element. Any ancestor elements marked as formatting
 * elements (using the <code>deltaxml:format="true"</code> attribute) are removed and encoded 
 * as attributes on the word and space elements that they enclose. Any attributes that the 
 * format tag had are also encoded.</p> 
 * 
 * <p><strong>Example</strong> The xml snippet:
 * <pre>
 * &lt;p&gt;sample of &lt;b deltaxml:format="true" id="bold-tag"&gt;format&lt;/b&gt; tags!&lt;/p&gt;
 * </pre>
 * would be converted to (pretty printed)
 * <pre>
 * &lt;p&gt;
 * &nbsp;&nbsp;&lt;deltaxml:word&gt;sample&lt;/deltaxml:word&gt;
 * &nbsp;&nbsp;&lt;deltaxml:space&gt;&nbsp;&lt;/deltaxml:space&gt;
 * &nbsp;&nbsp;&lt;deltaxml:word&gt;of&lt;/deltaxml:word&gt;
 * &nbsp;&nbsp;&lt;deltaxml:space&gt;&nbsp;&lt;/deltaxml:space&gt;
 * &nbsp;&nbsp;&lt;deltaxml:word b="id='bold-tag'"&gt;format&lt;/deltaxml:word&gt;
 * &nbsp;&nbsp;&lt;deltaxml:space&gt;&nbsp;&lt;/deltaxml:space&gt;
 * &nbsp;&nbsp;&lt;deltaxml:word&gt;tags&lt;/deltaxml:word&gt;
 * &nbsp;&nbsp;&lt;deltaxml:punctuation&gt;!&lt;/deltaxml:punctuation&gt;
 * &lt;/p&gt;
 * </pre>
 * </p>
 * 
 * <p>To prevent detailed comparison of a specific section of the XML
 * file, the <code>deltaxml:word-by-word</code> attribute should be
 * included in an element and set to <code>false</code>. This
 * attribute applies to a complete subtree beneath that element unless
 * overridden by another <code>deltaxml:word-by-word</code>
 * attribute at a lower level. <strong>Note: This will also
 * prevent formatting elements from being processed as above - they will be
 * output as elements.</strong></p>
 * 
 * <p>The use of <code>xml:space="preserve"</code> on an element will
 * also have the effect of preventing detailed comparison.</p>
 * 
 * <p>To change the characters that should be treated as punctuation, the
 * <code>deltaxml:punctuation</code> attribute should be included in an element
 * with the punctuation characters in a space separated list. This
 * attribute applies to a complete subtree beneath that element unless
 * overridden by another <code>deltaxml:punctuation</code>
 * attribute at a lower level. If no <code>deltaxml:punctuation</code> attribute
 * is found in the input then punctuation will not be wrapped in elements; ie the
 * default set of punctuation characters is the empty set unless overridden.
 * A suggested/possible set of characters could be included with this attribute
 * <code>deltaxml:punctuation=". , ; : ! ? ( ) &apos; &quot; \ /"</code>.</p>
 * 
 * <p><code>WordByWordInfilter</code> should always be used in conjunction with
 * {@link WordByWordOutfilter1}. It is designed to be used as a pre-filter and should be
 * placed on both <code>XMLComparator</code> input pipelines.</p>
 * 
 * <p><strong>Note: </strong>This class has not been designed to be extended,
 * therefore to err on the side of caution, it has been declared final.</p>
 * 
 * @version $Id: WordByWordInfilter.java 4699 2008-10-16 14:19:53Z nigelw $
 * @author Tristan Mitchell
 * @see com.deltaxml.pipe.filters.WordByWordOutfilter1
 *
 */
public final class WordByWordInfilter extends XMLFilterImpl2
{
  private final String DELTAXML_NS= "http://www.deltaxml.com/ns/well-formed-delta-v1";
  private final String DELTAXML_PREFIX= "deltaxml";
  private final String XML_NS= "http://www.w3.org/XML/1998/namespace";
  private final String XML_SPACE_LOCAL_NAME= "space";
  private final String WBW_LOCAL_NAME= "word-by-word";
  private final String PUNCTUATION_LOCAL_NAME= "punctuation";
  private final String FORMAT_LOCAL_NAME= "format";
  
  private Stack xmlSpaceStack= new Stack();
  private Stack wordByWordStack= new Stack();
  private Stack formatStack= new Stack(); //is the current element a formatting element? Needed to remove elements from currentFormat List
  private Stack punctuationCharsStack= new Stack();
  
  private StringBuffer storedChars= new StringBuffer();
  
  private List currentFormat= new ArrayList();
  
  // The following incompatible setting made it into the 4.1.2 release. Reverting back
  // to ensure compatibility for existing customers.
  //  private String defaultPunctuation= ". , ; : ! ? ( ) ' \" \\ /";
  private String defaultPunctuation= "";
  
  private char [] delimiters= { '"', '\'', '|', '~', '%', '^', '+', '_', '`', '/', '\\', '$', '?', ',', ';', '!' };

  private boolean isCharacterByCharacter= false;

  /**
   * Used to control where &lt;deltaxml:word&gt; elements contain a single character or a word.
   * @param cbc sets character-by-character mode when true
   */
  public void setisCharacterByCharacter(boolean cbc) {
    isCharacterByCharacter= cbc;
  }
  
  public boolean getisCharacterByCharacter() {
    return isCharacterByCharacter;
  }

  
  /**
   * Overrides the default <code>startPrefixMapping</code> method. This version of the method performs internal operations.
   * @throws SAXException the superclass may throw an exception during processing
   * @see XMLFilterImpl#startPrefixMapping(String, String)
   */  
  public void startPrefixMapping(String prefix, String uri) throws SAXException {
    if(storedChars.length() > 0) {
      outputCharacters();
    }
    super.startPrefixMapping(prefix, uri);
  }
  
  
  /**
   * Overrides the default <code>endDocument</code> method.
   * @throws SAXException the superclass may throw an Exception during processing
   * @see XMLFilterImpl#endDocument()
   */
  public void endDocument() throws SAXException {
    super.endPrefixMapping(DELTAXML_PREFIX);
    super.endDocument();
  }

  /**
   * Overrides the default <code>startDocument</code> method.
   * @throws SAXException the superclass may throw an Exception during processing
   * @see XMLFilterImpl#startDocument()
   */
  public void startDocument() throws SAXException {
    super.startDocument();
    super.startPrefixMapping(DELTAXML_PREFIX, DELTAXML_NS);
  }

  /**
   * <p>Overrides the default <code>startElement</code> method. This method processes <code>xml:space</code>
   * and <code>deltaxml:word-by-word</code> attributes.</p>
   * 
   * <p>This method also stores details of elements that have the
   * <code>deltaxml:format="true"</code> attribute. These elements are stored in a list and encoded as
   * attributes on any <code>deltaxml:word</code> or <code>deltaxml:space</code> elements that are output.</p>
   * @throws SAXException the superclass may throw an exception during processing
   * @see XMLFilterImpl#startElement(String, String, String, Attributes)
   */
  public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
    if(storedChars.length() > 0) {
      outputCharacters();
    }
    boolean preserve;
    boolean wordByWord;
    List punctuationChars;
    
    try {
      preserve= ((Boolean)xmlSpaceStack.peek()).booleanValue();
    } catch (EmptyStackException e) {
      preserve= false; // by default we don't preserve any whitespace
    }
    try {
      wordByWord= ((Boolean)wordByWordStack.peek()).booleanValue();
    } catch (EmptyStackException e) {
      wordByWord= true; //by default we ARE splitting words
    }
    try {
      punctuationChars= (List)punctuationCharsStack.peek();
    } catch (EmptyStackException e) {
      punctuationChars= parsePunctuationString(defaultPunctuation); //use default punctuation 
    }
    
    /**
     * This is a workaround because saxon 6 has a bug if we use atts.getValue(qName)
     * We can get the index based on qName and get the value from the index
     */
    
//    String xmlSpaceVal= atts.getValue(XML_SPACE_ATT_NAME);
    int index= atts.getIndex(XML_NS, XML_SPACE_LOCAL_NAME);
    if(index != -1) {
      /**
       * MUST test that the index is valid. atts.getValue(index) is documented as returning null if the index is out of 
       * but saxon throws an ArrayIndexOutOfBoundsException instead
       */
      String xmlSpaceVal= atts.getValue(index);
      if (xmlSpaceVal != null) {
        if (xmlSpaceVal.equals("preserve")) {
          preserve= true;
        } else if (xmlSpaceVal.equals("default")) {
          preserve= false;
        } 
      }
    }

    String wordByWordVal= atts.getValue(DELTAXML_NS, WBW_LOCAL_NAME);
    if (wordByWordVal != null) {
      if (wordByWordVal.equals("false")) {
        wordByWord= false;
      } else if (wordByWordVal.equals("true")) {
        wordByWord= true;
      }
    }
    
    String punctuation= atts.getValue(DELTAXML_NS, PUNCTUATION_LOCAL_NAME);
    if (punctuation != null) {
      punctuationChars= parsePunctuationString(punctuation);
    }
    
    xmlSpaceStack.push(new Boolean(preserve));
    wordByWordStack.push(new Boolean(wordByWord));
    punctuationCharsStack.push(punctuationChars);
    
    if ("true".equals(atts.getValue(DELTAXML_NS, FORMAT_LOCAL_NAME)) && wordByWord) {
      //we have a deltaxml:format="true" attribute
      formatStack.push(new Boolean(true));
      currentFormat.add(new FormatEntry(uri, localName, qName, atts));
    } else {
      formatStack.push(new Boolean(false));
      super.startElement(uri, localName, qName, atts);
    }
  }
  
  /**
   * Overrides the default <code>endElement</code> method. This version of the method performs internal operations.
   * @throws SAXException the superclass may throw an exception during processing
   * @see XMLFilterImpl#endElement(String, String, String)
   */
  public void endElement(String uri, String localName, String qName) throws SAXException {
    if(storedChars.length() > 0) {
      outputCharacters();
    }
    
    try {
      xmlSpaceStack.pop();
      wordByWordStack.pop();
      punctuationCharsStack.pop();
    } catch (EmptyStackException e) {
      throw new SAXException("Internal Error: Empty Stack in " + this.getClass().getName());
    }
    boolean format;
    try {
      format= ((Boolean)formatStack.pop()).booleanValue();
    } catch (EmptyStackException e) {
      //shouldn't happpen but just in case....
      format= false;
    }

    if (format) {
      currentFormat.remove(currentFormat.size()-1);
    } else {
      super.endElement(uri, localName, qName);
    }
  }
  
  /**
   * Overrides the default <code>characters</code> method. This version of the method stores the characters
   * so that they can be output at a later stage.
   * @throws SAXException the superclass may throw an exception during processing
   * @see XMLFilterImpl#characters(char[], int, int)
   */
  public void characters(char [] ch, int start, int length) throws SAXException {
    storedChars.append(ch, start, length);
  }  

  /**
   * Overrides the default <code>ignorableWhitespace</code> method. This version of the method performs internal operations.
   * @throws SAXException the superclass may throw an exception during processing
   * @see XMLFilterImpl#ignorableWhitespace(char[], int, int)
   */
  public void ignorableWhitespace(char [] ch, int start, int length) throws SAXException {
    if(storedChars.length() > 0) {
      outputCharacters();
    }
  }
  
  private void outputCharacters() throws SAXException {
    if ( !((Boolean)xmlSpaceStack.peek()).booleanValue() && ((Boolean)wordByWordStack.peek()).booleanValue() ) {
      // xmlSpaceStack top is false and wordByWordStack top is true
      StringBuffer whiteSpace= new StringBuffer();
      StringBuffer content= new StringBuffer();
      StringBuffer punctuation= new StringBuffer();
      for (int i= 0; i < storedChars.length(); i++) {
        char currentChar= storedChars.charAt(i);
        if (currentChar == ' ' || currentChar == '\t' || currentChar == '\r' || currentChar == '\n') {
          //we are in whitespace
          if (content.length() > 0) {
            output(content, "word");
            content= new StringBuffer();
          } else if (punctuation.length() > 0) {
            output(punctuation, "punctuation");
            punctuation= new StringBuffer();
          }
          whiteSpace.append(currentChar);
        } else if (isCharacter(i)) {
          //we are in punctuation
          if (content.length() > 0) {
            output(content, "word");
            content= new StringBuffer();
          } else if (whiteSpace.length() > 0) {
            output(whiteSpace, "space");
            whiteSpace= new StringBuffer();
          }
          punctuation.append(currentChar);
        } else {
          //we are in content
          if(whiteSpace.length() > 0) {
            output(whiteSpace, "space");
            whiteSpace= new StringBuffer();
          } else if (punctuation.length() > 0) {
            output(punctuation, "punctuation");
            punctuation= new StringBuffer();
          }
          if (isCharacterByCharacter) {
            content.append(currentChar);
            if (content.length() != 1) {
              throw new SAXException("Internal Error:  CBC mode logical error - found multiple characters when single expected");
            }
            output(content, "word");
            content= new StringBuffer();
          } else { // standard WBW behaviour
            content.append(currentChar);
          }
        }
      }
      if(content.length() > 0) {
        output(content, "word");
      }
      if(whiteSpace.length() > 0) {
        output(whiteSpace, "space");
      }
      if(punctuation.length() > 0) {
        output(punctuation, "punctuation");
      }
    } else {
      char [] copy= new char[storedChars.length()];
      storedChars.getChars(0, storedChars.length(), copy, 0);
      super.characters(copy, 0, copy.length);
    }
    storedChars= new StringBuffer();
  }
  
  private boolean isCharacter(int i) {
    char currentChar= storedChars.charAt(i);
    List currentPunctuationList= (List)punctuationCharsStack.peek();

    if (currentPunctuationList.contains(new Character(currentChar))) {
      int puncBegin= i;
      int puncEnd= i;
      // find beginning of punctuation
      while (puncBegin - 1 >= 0 && currentPunctuationList.contains(new Character(storedChars.charAt(puncBegin - 1)))) {
        puncBegin--;
      }

      // find end of punctuation
      while (puncEnd + 1 < storedChars.length() && currentPunctuationList.contains(new Character(storedChars.charAt(puncEnd + 1)))) {
        puncEnd++;
      }

      // beginning or end of the string
      if (puncBegin == 0 || puncEnd == storedChars.length() - 1) {
        return true;
      }

      // whitespace before or after
      char beforeChar= storedChars.charAt(puncBegin - 1);
      char afterChar= storedChars.charAt(puncEnd + 1);

      if (beforeChar == ' ' || beforeChar == '\t' || beforeChar == '\r' || beforeChar == '\n') {
        return true;
      }
      if (afterChar == ' ' || afterChar == '\t' || afterChar == '\r' || afterChar == '\n') {
        return true;
      }
    }

    return false;
  }

  private void output(StringBuffer sb, String elemType) throws SAXException {
    char [] chars= new char[sb.length()];
    sb.getChars(0, sb.length(), chars, 0);
    AttributesImpl atts= new AttributesImpl();
    if(currentFormat.size() > 0) {
      Set formatSet= new HashSet();
//      formatSet.addAll(currentFormat);
      for (int i= currentFormat.size()-1; i >= 0; i--) {
        formatSet.add(currentFormat.get(i));
      }
      Iterator it= formatSet.iterator();
      while(it.hasNext()) {
        FormatEntry fe= (FormatEntry)it.next();
        atts.addAttribute(fe.getUri(), fe.getLocalName(), fe.getQName(), "CDATA", fe.getEncodedAtts());
      }
    }
    super.startElement(DELTAXML_NS, elemType, DELTAXML_PREFIX + ":" + elemType, atts);
    super.characters(chars, 0, chars.length);
    super.endElement(DELTAXML_NS, elemType, DELTAXML_PREFIX + ":" + elemType);
  }

  private List parsePunctuationString(String defaultPunctuation) {
    List punctuationList= new ArrayList();
    
    StringTokenizer charsST= new StringTokenizer(defaultPunctuation);
    
    while (charsST.hasMoreTokens()) {
      String charToken= charsST.nextToken();
      
      // check that the token is a single char long
      if (charToken.length() == 1) {
        punctuationList.add(new Character(charToken.charAt(0)));
      }
    }    
    return punctuationList;
  }
  
  private class FormatEntry {
    
    private String uri;
    private String localName;
    private String qName;
    private AttributesImpl atts;
    
    FormatEntry(String uri, String localName, String qName, Attributes atts) {
      this.uri= uri;
      this.localName= localName;
      this.qName= qName;
      this.atts= new AttributesImpl(atts);
    }
    
    String getLocalName() {
      return localName;
    }
    
    void setLocalName(String localName) {
      this.localName= localName;
    }
    
    String getUri() {
      return uri;
    }
    
    void setUri(String namespace) {
      this.uri= namespace;
    }
    
    String getQName() {
      return qName;
    }
    
    void setQName(String name) {
      qName= name;
    }
    
    String getEncodedAtts() throws SAXException {
      //let's remove the deltaxml:format attribute before encoding the element's attributes
      int index= atts.getIndex("deltaxml:format");
      if (index != -1) {
        atts.removeAttribute(index);
      }
      String attString= "";
      for (int i= 0; i < atts.getLength(); i++) {
        String qName= atts.getQName(i);
        String value= atts.getValue(i);
        char delimiter;
        int j= 0;
        boolean found= false;
        do {
          delimiter= delimiters[j];
          j++;
          found= (value.indexOf(delimiter) == -1);
        } while(!found && i < delimiters.length);
        if (!found) {
          throw new SAXException("ERROR: Could not find a delimiter for encoding the attribute value '" + value +"'");
        }
        attString += qName + "=" + delimiter + value + delimiter + " ";
          
      }
      return attString;
    }
    
    public int hashCode() {
      return uri.hashCode() | localName.hashCode();
    }
    
    public boolean equals(Object other) {
      if (other instanceof FormatEntry) {
        FormatEntry fe= (FormatEntry)other;
        return uri.equals(fe.uri) && localName.equals(fe.localName);
      } else {
        return false;
      }
    }
    
  }
  
}
