package org.archive.resource.html;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.format.http.HttpHeaders;
import org.archive.format.json.JSONUtils;
import org.archive.format.text.charset.CharsetDetector;
import org.archive.format.text.charset.StandardCharsetDetector;
import org.archive.format.text.html.CDATALexer;
import org.archive.format.text.html.LexParser;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.resource.ResourceContainer;
import org.archive.resource.ResourceFactory;
import org.archive.resource.ResourceParseException;
import org.htmlparser.lexer.Page;
import org.htmlparser.util.ParserException;
import org.json.JSONException;
import org.json.JSONObject;

/* loaded from: input_file:org/archive/resource/html/HTMLResourceFactory.class */
/**
 * {@link ResourceFactory} that runs an HTML payload through a {@link LexParser}
 * and returns an {@link HTMLResource} backed by the metadata filled in during
 * the parse.
 *
 * <p>Before parsing, up to {@link #CHARSET_GUESS_CHUNK_SIZE} bytes are peeked
 * from the stream and handed to the configured {@link CharsetDetector}, along
 * with the HTTP {@code Content-Type} header when one is present in the
 * metadata, to choose the charset used for decoding.
 */
public class HTMLResourceFactory implements ResourceFactory {
    public static final Log LOG = LogFactory.getLog(HTMLResourceFactory.class);

    /** Number of leading payload bytes peeked at for charset detection. */
    protected static final int CHARSET_GUESS_CHUNK_SIZE = 8192;

    /** Path, inside the top-level metadata, of the HTTP response headers object. */
    protected static final String HTTP_HEADER_PATH = "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers";

    /** Charset used when the payload is empty or detection fails. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Name of the only HTTP header forwarded to the charset detector. */
    private static final String CONTENT_TYPE_HEADER = "Content-Type";

    /** Detector used to guess the payload charset; subclasses may replace it. */
    protected CharsetDetector charSetDetector = new StandardCharsetDetector();

    /**
     * Parses the HTML read from {@code inputStream} and returns the resulting resource.
     *
     * <p>The stream is wrapped in a {@link BufferedInputStream} but is not closed here.
     *
     * @param inputStream       stream positioned at the start of the HTML payload
     * @param metaData          parent metadata that the HTML metadata is attached to
     * @param resourceContainer container passed through to the created {@link HTMLResource}
     * @return the {@link HTMLResource} wrapping the populated HTML metadata
     * @throws ResourceParseException if the charset is unsupported, the parser fails,
     *                                or the parse runs out of memory
     * @throws IOException            if reading or resetting the stream fails
     */
    @Override // org.archive.resource.ResourceFactory
    public Resource getResource(InputStream inputStream, MetaData metaData, ResourceContainer resourceContainer) throws ResourceParseException, IOException {
        HTMLMetaData htmlMetaData = new HTMLMetaData(metaData);
        LexParser parser = new LexParser(new ExtractingParseObserver(htmlMetaData));
        CDATALexer lexer = new CDATALexer();

        BufferedInputStream buffered = new BufferedInputStream(inputStream, CHARSET_GUESS_CHUNK_SIZE);
        byte[] head = new byte[CHARSET_GUESS_CHUNK_SIZE];

        // The mark limit must cover every byte consumed before reset(); a limit
        // of 0 only works by accident of the buffer size matching the read size.
        buffered.mark(CHARSET_GUESS_CHUNK_SIZE);
        int headLength = buffered.read(head, 0, CHARSET_GUESS_CHUNK_SIZE);
        buffered.reset();

        String charset = DEFAULT_CHARSET;
        if (headLength > 0) {
            charset = detectCharset(head, headLength, htmlMetaData);
        }

        try {
            lexer.setPage(new Page(buffered, charset));
            parser.doParse(lexer);
            return new HTMLResource(htmlMetaData, resourceContainer);
        } catch (UnsupportedEncodingException e) {
            throw new ResourceParseException(e);
        } catch (OutOfMemoryError e) {
            // An Error cannot be passed where an Exception is expected, so wrap it
            // to keep the cause visible to whoever handles the parse failure.
            throw new ResourceParseException(new IllegalStateException("Out of memory while parsing HTML", e));
        } catch (ParserException e) {
            throw new ResourceParseException(e);
        }
    }

    /**
     * Asks the charset detector for the payload charset, best-effort.
     *
     * @param head         buffer holding the leading bytes of the payload
     * @param headLength   number of valid bytes in {@code head}
     * @param htmlMetaData metadata whose top-level object is searched for HTTP headers
     * @return the detector's answer, or {@value #DEFAULT_CHARSET} if detection throws
     */
    private String detectCharset(byte[] head, int headLength, HTMLMetaData htmlMetaData) {
        HttpHeaders headers = new HttpHeaders();
        JSONObject headerJson = JSONUtils.extractObject(htmlMetaData.getTopMetaData(), HTTP_HEADER_PATH);

        // Defensive: tolerate metadata that carries no HTTP header object.
        if (headerJson != null && headerJson.has(CONTENT_TYPE_HEADER)) {
            try {
                headers.add(CONTENT_TYPE_HEADER, headerJson.getString(CONTENT_TYPE_HEADER));
            } catch (JSONException e) {
                // Detection still proceeds from the bytes alone.
                LOG.debug("Ignoring unreadable Content-Type header", e);
            }
        }

        try {
            return this.charSetDetector.getCharset(head, headLength, headers);
        } catch (Exception e) {
            LOG.error("Failed to guess charset: " + e.getMessage(), e);
            return DEFAULT_CHARSET;
        }
    }
}
