/*
 * Decompiled with CFR 0.152.
 */
package com.kingdee.eas.cp.eip.fts.documents;

import com.kingdee.eas.cp.eip.fts.documents.A_TextExtractor;
import com.kingdee.eas.cp.eip.fts.documents.DOMContentUtils;
import com.kingdee.eas.cp.eip.fts.documents.ExtractionResult;
import com.kingdee.eas.cp.eip.fts.documents.I_ExtractionResult;
import com.kingdee.eas.cp.eip.fts.documents.I_TextExtractor;
import com.kingdee.eas.cp.eip.fts.documents.StringUtil;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.SystemUtils;
import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.InputSource;

/**
 * Text extractor for HTML input.
 *
 * <p>The input is parsed into a DOM fragment with the NekoHTML
 * {@link DOMFragmentParser}; the title and body text are then collected through
 * {@link DOMContentUtils} and returned as a single {@link ExtractionResult}.
 * The character encoding handed to the parser is sniffed from a
 * {@code <meta http-equiv="content-type" ...>} tag near the start of the
 * content, falling back to {@code utf-8} when no such tag is found.
 */
public class ExtractorHtml extends A_TextExtractor {
    /** Shared instance handed out by {@link #getExtractor()}. */
    private static final ExtractorHtml INSTANCE = new ExtractorHtml();

    /** Maximum number of leading content bytes inspected when sniffing the charset. */
    private static final int CHUNK_SIZE = 2000;

    /** Matches a {@code <meta ... http-equiv=content-type ...>} tag and captures its attributes. */
    private static final Pattern META_PATTERN =
            Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>", Pattern.CASE_INSENSITIVE);

    /** Captures the charset name out of the attributes matched by {@link #META_PATTERN}. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);

    /** Encoding set on the parser input when no charset could be sniffed. */
    private static final String DEFAULT_CHAR_ENCODING = "utf-8";

    // Not referenced anywhere in this class; alias resolution goes through StringUtil.
    private static final HashMap encodingAliases = new HashMap();

    /**
     * Returns the shared extractor instance.
     *
     * @return the singleton {@code ExtractorHtml}
     */
    public static I_TextExtractor getExtractor() {
        return INSTANCE;
    }

    /**
     * Looks for a charset declaration inside a content-type {@code <meta>} tag
     * within the first {@link #CHUNK_SIZE} bytes of {@code content}.
     *
     * @param content raw content bytes to inspect
     * @return the declared charset name exactly as written in the tag, or
     *         {@code null} when no declaration is found
     * @throws UnsupportedEncodingException declared only because of the charset-name
     *         constructor used below
     */
    private static String sniffCharacterEncoding(byte[] content) throws UnsupportedEncodingException {
        int length = Math.min(content.length, CHUNK_SIZE);
        // ISO-8859-1 maps each byte to the char with the same value, which is what the
        // deprecated String(byte[], hibyte=0, offset, count) constructor used to do.
        String head = new String(content, 0, length, "ISO-8859-1");

        Matcher metaMatcher = META_PATTERN.matcher(head);
        if (!metaMatcher.find()) {
            return null;
        }
        Matcher charsetMatcher = CHARSET_PATTERN.matcher(metaMatcher.group(1));
        if (!charsetMatcher.find()) {
            return null;
        }
        return charsetMatcher.group(1);
    }

    /**
     * Extracts the title and text of an HTML document.
     *
     * <p>NOTE(review): the {@code encoding} argument is never read; it is
     * overwritten by the sniffed encoding before use. Confirm whether callers
     * expect their value to be honoured.
     *
     * @param in       stream holding the HTML document
     * @param encoding ignored, see the note above
     * @return an {@link ExtractionResult} built from the trimmed title followed
     *         directly by the extracted text
     * @throws Exception wrapping any failure raised while reading or parsing;
     *         the message is the original exception's {@code toString()} and the
     *         original exception is attached as the cause
     */
    public I_ExtractionResult extractText(InputStream in, String encoding) throws Exception {
        DOMFragmentParser parser = new DOMFragmentParser();
        try {
            HTMLDocumentImpl impl = new HTMLDocumentImpl();
            impl.setErrorChecking(false);
            DocumentFragment root = impl.createDocumentFragment();

            // The copy is taken before the stream is read for sniffing; this order is kept
            // as-is because getStreamCopy is defined outside this class.
            InputSource input = new InputSource(this.getStreamCopy(in));

            encoding = ExtractorHtml.sniffCharacterEncoding(this.getContentByte(in));
            if (encoding == null) {
                input.setEncoding(DEFAULT_CHAR_ENCODING);
            } else {
                encoding = StringUtil.resolveEncodingAlias(encoding);
                // When the alias cannot be resolved no encoding is set on the input at all.
                if (encoding != null) {
                    input.setEncoding(encoding);
                }
            }

            parser.parse(input, root);

            StringBuffer sb = new StringBuffer();
            DOMContentUtils.getText(sb, root);
            String text = sb.toString();

            sb.setLength(0);
            DOMContentUtils.getTitle(sb, root);
            String title = sb.toString().trim();

            return new ExtractionResult(title + text);
        }
        catch (Exception ex) {
            // Same message as before, but keep the cause so the stack trace is not lost.
            throw new Exception(ex.toString(), ex);
        }
    }

    /**
     * Reads the whole stream line by line and returns the concatenated lines as
     * bytes in the platform default charset.
     *
     * <p>Line terminators are dropped and nothing is inserted in their place, so
     * text from consecutive lines ends up adjacent. The stream is read to its
     * end but is not closed here.
     *
     * <p>NOTE(review): because line breaks are removed, a {@code <meta>} tag whose
     * name and first attribute are separated only by a newline will not match the
     * sniffing pattern. Left unchanged to preserve existing results.
     *
     * @param is stream to read
     * @return the bytes of all lines joined together
     * @throws UnsupportedEncodingException if {@code SystemUtils.FILE_ENCODING} is not supported
     * @throws IOException if reading from the stream fails
     */
    private byte[] getContentByte(InputStream is) throws UnsupportedEncodingException, IOException {
        BufferedReader reader = new BufferedReader(new InputStreamReader(is));
        // Accumulate in a buffer rather than re-copying an ever-growing String per line.
        StringBuffer content = new StringBuffer();
        String line;
        while ((line = reader.readLine()) != null) {
            content.append(new String(line.getBytes(), SystemUtils.FILE_ENCODING));
        }
        return content.toString().getBytes();
    }
}

