package com.nexwave.nquindexer;

import com.nexwave.nsidita.DocFileInfo;
import com.nexwave.stemmer.snowball.SnowballStemmer;
import com.nexwave.stemmer.snowball.ext.EnglishStemmer;
import com.nexwave.stemmer.snowball.ext.FrenchStemmer;
import com.nexwave.stemmer.snowball.ext.GermanStemmer;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/* loaded from: input_file:com/nexwave/nquindexer/SaxHTMLIndex.class */
/**
 * Parses a document, cleans and tokenizes its text, and records every distinct
 * token in a shared dictionary that maps a token to a comma-separated list of
 * document ordinals.
 *
 * <p>The ordinal of a document is the number of {@link #runExtractData(File, String)}
 * calls already completed on this instance (the first document is {@code 0}).
 *
 * <p>{@link #init(Map)} must be called before {@link #runExtractData(File, String)};
 * otherwise the dictionary is {@code null} and recording tokens fails with a
 * {@link NullPointerException}.
 */
public class SaxHTMLIndex extends SaxDocFileParser {
    /**
     * Built-in stop-word pattern, applied only when no custom clean-up list was
     * supplied. Every alternative is anchored with word boundaries, so the order
     * of the alternatives does not influence what is matched.
     */
    private static final String DEFAULT_STOP_WORDS =
            "(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b"
            + "|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b"
            + "|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b"
            + "|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b"
            + "|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"
            + "|\\b-or-\\b"
            + "|\\bI\\b|\\bme\\b|\\bmy\\b";

    /**
     * Matches the NexWave copyright notice in text that has ALREADY been
     * lower-cased. The copyright sign is optional and matched as any run of
     * non-whitespace, because its encoding in the parsed text is not known here.
     */
    private static final String COPYRIGHT_NOTICE =
            "copyright\\s+(?:\\S+\\s+)?1998-2007\\s+nexwave\\s+solutions\\.";

    /** Token to comma-separated document ordinals; supplied through {@link #init(Map)}. */
    private Map<String, String> tempDico;

    /** Ordinal assigned to the document currently being indexed. */
    private int docCounter;

    /** Custom stop words, or {@code null}/empty to use the built-in list. */
    private final ArrayList<String> cleanUpList;

    /** Custom punctuation patterns, or {@code null}/empty for none. */
    private final ArrayList<String> cleanUpPunctuation;

    /** Creates an indexer that uses the built-in stop words and no custom punctuation. */
    public SaxHTMLIndex() {
        this(null, null);
    }

    /**
     * Creates an indexer with a custom stop-word list and no custom punctuation.
     *
     * @param arrayList custom stop words; {@code null} or empty selects the built-in list
     */
    public SaxHTMLIndex(ArrayList<String> arrayList) {
        this(arrayList, null);
    }

    /**
     * Creates an indexer with custom stop words and custom punctuation.
     *
     * <p>Entries of both lists are inserted verbatim into regular expressions, so
     * any regex metacharacter in an entry is interpreted as regex syntax.
     *
     * @param arrayList  custom stop words; {@code null} or empty selects the built-in list
     * @param arrayList2 custom punctuation patterns; {@code null} or empty adds none
     */
    public SaxHTMLIndex(ArrayList<String> arrayList, ArrayList<String> arrayList2) {
        this.docCounter = 0;
        this.cleanUpList = arrayList;
        this.cleanUpPunctuation = arrayList2;
    }

    /**
     * Sets the dictionary that receives the indexed tokens.
     *
     * @param map dictionary to fill; it is stored by reference, not copied
     * @return always {@code 0}
     */
    public int init(Map<String, String> map) {
        this.tempDico = map;
        return 0;
    }

    /**
     * Parses {@code file}, tokenizes its cleaned text and adds the current
     * document ordinal to the dictionary entry of every distinct token.
     *
     * <p>For {@code ja}, {@code zh} and {@code ko} the text is tokenized with
     * Lucene's {@code CJKAnalyzer}; for {@code en}, {@code de} and {@code fr} the
     * whitespace-separated words are stemmed; for any other language, including
     * {@code null}, the whitespace-separated words are used unchanged.
     *
     * @param file document to index
     * @param str  language code, compared case-insensitively; may be {@code null}
     * @return the {@code DocFileInfo} created for {@code file}
     */
    public DocFileInfo runExtractData(File file, String str) {
        this.fileDesc = new DocFileInfo(file);
        this.strbf = new StringBuffer("");
        // NOTE(review): parseDocument is inherited; presumably it fills strbf with
        // the document text - confirm against SaxDocFileParser.
        parseDocument(file);

        String text = cleanBuffer(this.strbf).replaceAll("\\s+", " ");
        String[] words = text.split("\\s");
        String[] tokens = isCjk(str) ? tokenizeCjk(text, words) : stem(words, str);

        recordTokens(tokens);
        this.docCounter++;
        return this.fileDesc;
    }

    /** Tells whether {@code language} is one of the CJK codes; {@code null} is not. */
    private static boolean isCjk(String language) {
        return "ja".equalsIgnoreCase(language)
                || "zh".equalsIgnoreCase(language)
                || "ko".equalsIgnoreCase(language);
    }

    /**
     * Tokenizes {@code text} with the CJK analyzer.
     *
     * @param text     cleaned document text
     * @param fallback tokens to return when the analyzer fails with an I/O error
     * @return the analyzer's terms, or {@code fallback} on {@link IOException}
     */
    private String[] tokenizeCjk(String text, String[] fallback) {
        LinkedList<String> terms = new LinkedList<String>();
        TokenStream tokenStream = null;
        try {
            tokenStream = new CJKAnalyzer(Version.LUCENE_30).tokenStream("", new StringReader(text));
            TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
            tokenStream.addAttribute(OffsetAttribute.class);
            while (tokenStream.incrementToken()) {
                terms.add(termAttribute.term());
            }
            return terms.toArray(new String[terms.size()]);
        } catch (IOException e) {
            System.out.println("Error tokenizing content using CJK Analyzer. IOException");
            e.printStackTrace();
            return fallback;
        } finally {
            // Release the stream on every path; the original never closed it.
            if (tokenStream != null) {
                try {
                    tokenStream.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails; the tokens
                    // (or the fallback) have already been chosen.
                }
            }
        }
    }

    /**
     * Stems {@code words} when a stemmer exists for {@code language}.
     *
     * @return the stemmer's output for en/de/fr, otherwise {@code words} itself
     */
    private static String[] stem(String[] words, String language) {
        SnowballStemmer stemmer;
        if ("en".equalsIgnoreCase(language)) {
            stemmer = new EnglishStemmer();
        } else if ("de".equalsIgnoreCase(language)) {
            stemmer = new GermanStemmer();
        } else if ("fr".equalsIgnoreCase(language)) {
            stemmer = new FrenchStemmer();
        } else {
            return words;
        }
        return stemmer.doStem(words);
    }

    /** Appends the current document ordinal to the entry of each distinct token. */
    private void recordTokens(String[] tokens) {
        // De-duplicate so a document is listed at most once per token.
        HashSet<String> distinct = new HashSet<String>(Arrays.asList(tokens));
        String ordinal = Integer.toString(this.docCounter);
        for (String token : distinct) {
            String postings = this.tempDico.get(token);
            this.tempDico.put(token, postings == null ? ordinal : postings + "," + ordinal);
        }
    }

    /**
     * Lower-cases the buffer and removes stop words and punctuation.
     *
     * <p>Without a custom stop-word list, the built-in list is used and the first
     * NexWave copyright notice is removed. With a custom list, the pattern is the
     * word {@code a} plus every list entry, each wrapped in word boundaries.
     * With custom punctuation, the pattern is the ideographic full stop (U+3002)
     * plus every list entry.
     *
     * @param stringBuffer raw document text
     * @return the cleaned, lower-cased text
     */
    private String cleanBuffer(StringBuffer stringBuffer) {
        String text = stringBuffer.toString().toLowerCase();

        StringBuilder stopWords = new StringBuilder();
        if (this.cleanUpList == null || this.cleanUpList.isEmpty()) {
            stopWords.append(DEFAULT_STOP_WORDS);
            // The text is lower-case at this point, so the notice must be matched
            // in lower case; the original mixed-case literal could never match.
            text = text.replaceFirst(COPYRIGHT_NOTICE, " ");
        } else {
            stopWords.append("\\ba\\b");
            for (String word : this.cleanUpList) {
                stopWords.append("|\\b").append(word).append("\\b");
            }
        }

        StringBuilder punctuation = new StringBuilder();
        if (this.cleanUpPunctuation != null && !this.cleanUpPunctuation.isEmpty()) {
            punctuation.append("\\u3002");
            for (String mark : this.cleanUpPunctuation) {
                punctuation.append("|").append(mark);
            }
        }
        return minimalClean(text, stopWords.toString(), punctuation.toString());
    }

    /**
     * Collapses whitespace, blanks out {@code ->}, and removes punctuation both
     * before and after removing stop words.
     *
     * @param str               lower-cased text
     * @param stopWords         regular expression matching the stop words
     * @param customPunctuation regular expression for extra punctuation, or empty for none
     * @return the cleaned text
     */
    private String minimalClean(String str, String stopWords, String customPunctuation) {
        String text = str.replaceAll("\\s+", " ").replaceAll("->", " ");
        text = stripPunctuation(text, customPunctuation);
        text = text.replaceAll(stopWords, " ");
        // Second pass, kept from the original order of operations.
        return stripPunctuation(text, customPunctuation);
    }

    /** Replaces the standard punctuation classes, then the custom one if present, with spaces. */
    private static String stripPunctuation(String text, String customPunctuation) {
        String result = text
                .replaceAll(IndexerConstants.EUPUNCTUATION1, " ")
                .replaceAll(IndexerConstants.EUPUNCTUATION2, " ")
                .replaceAll(IndexerConstants.JPPUNCTUATION1, " ")
                .replaceAll(IndexerConstants.JPPUNCTUATION2, " ")
                .replaceAll(IndexerConstants.JPPUNCTUATION3, " ");
        if (customPunctuation.length() > 0) {
            result = result.replaceAll(customPunctuation, " ");
        }
        return result;
    }
}
