package cn.com.duiba.spider.util.maiquan.spider;

import cn.com.duiba.spider.util.maiquan.constant.ContentSource;
import cn.com.duiba.spider.util.maiquan.dto.DefaultDto;
import cn.com.duiba.spider.util.maiquan.exception.ErrorCode;
import cn.com.duiba.spider.util.maiquan.exception.MaiQuanSpiderException;
import cn.com.duiba.spider.util.maiquan.proxy.Data5UProxyProvider;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import javax.net.ssl.SSLSocketFactory;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;

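/**
 * Abstract base class for spiders: validates a {@link RequestParam}, builds and
 * executes the corresponding jsoup request, and declares the crawl entry points
 * that concrete spiders implement.
 *
 * @author: sjx
 * @date: 2019-05-27 11:21
 */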
public abstract class AbstractSpider {

    /**
     * Regular expression used to validate the URL format.
     * {@code checkParam} applies it to the part of the URL before the first {@code ?}.
     */
    private static final String URL_REGEX = "(https?|http)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]";

    /**
     * No-argument constructor; performs no initialization.
     */
    protected AbstractSpider() {
    }

    /**
     * Returns the content source this spider is associated with.
     * {@code smashOnErrorWithUnknown} uses it to decide whether a fallback to the
     * {@code ContentSource.UNKNOWN} spider is possible.
     *
     * @return the content source of this spider
     */
    protected abstract ContentSource getContentSource();

    /**
     * Requests the given address and returns the response body as a string.
     *
     * @param param request description (URL, method, headers, ...)
     * @return the raw response body
     */
    protected String doRequestString(RequestParam param) {
        Connection.Response response = doRequest(param);
        return response.body();
    }

    /**
     * Requests the given address and returns the response body parsed as a
     * fastjson {@code JSONObject}.
     *
     * @param param request description (URL, method, headers, ...)
     * @return the response body parsed by {@code JSON.parseObject}
     */
    protected JSONObject doRequestJSON(RequestParam param) {
        String payload = doRequest(param).body();
        return JSON.parseObject(payload);
    }

    /**
     * Requests the given address and returns the response parsed as a jsoup document.
     *
     * @param param request description (URL, method, headers, ...)
     * @return the parsed {@link org.jsoup.nodes.Document}
     * @throws MaiQuanSpiderException with {@code ErrorCode.E003} when parsing the
     *                                response raises an {@link IOException}
     * @see org.jsoup.nodes.Document
     */
    protected Document doRequestDocument(RequestParam param) {
        Connection.Response response = doRequest(param);
        try {
            return response.parse();
        } catch (IOException parseFailure) {
            throw new MaiQuanSpiderException(ErrorCode.E003, parseFailure);
        }
    }

    /**
     * Executes the request described by {@code param} and returns the raw jsoup response.
     *
     * @param param request description (URL, method, headers, ...)
     * @return the executed {@link org.jsoup.Connection.Response}
     * @throws MaiQuanSpiderException with {@code ErrorCode.E006} on a socket timeout,
     *                                or {@code ErrorCode.E002} on any other {@link IOException}
     * @see org.jsoup.Connection.Response
     */
    protected Connection.Response doRequest(RequestParam param) {
        final Connection prepared = buildConnection(param);
        try {
            Connection.Response response = prepared.execute();
            return response;
        } catch (SocketTimeoutException timeout) {
            // Must be caught before IOException: SocketTimeoutException is a subclass of it.
            throw new MaiQuanSpiderException(ErrorCode.E006, timeout);
        } catch (IOException ioFailure) {
            throw new MaiQuanSpiderException(ErrorCode.E002, ioFailure);
        }
    }

    /**
     * Builds the jsoup connection for a request.
     * <ol>
     *   <li>validate the parameters</li>
     *   <li>create the connection and set its basic attributes</li>
     *   <li>set headers</li>
     *   <li>set cookies</li>
     *   <li>set the request body / form data</li>
     *   <li>set the proxy</li>
     *   <li>set the SSL socket factory</li>
     * </ol>
     * The redirect behaviour follows {@code param.isRedirect()}, so a caller that
     * disables redirects on the parameter actually gets a non-following connection.
     *
     * @param param request description; validated via {@code checkParam}
     * @return a configured, not yet executed {@link org.jsoup.Connection}
     * @see org.jsoup.Connection
     */
    private Connection buildConnection(RequestParam param) {
        // 1. validate the parameters
        checkParam(param);

        // 2. create the connection and set its basic attributes
        final int timeoutMillis = 5 * 1000;
        Connection connect = Jsoup.connect(param.getUrl())
                .method(param.getMethod())
                // honour the caller's choice instead of always following redirects
                .followRedirects(param.isRedirect())
                .ignoreHttpErrors(true)
                .ignoreContentType(true)
                .timeout(timeoutMillis);

        // 3. headers
        if (param.getHeaders() != null) {
            connect.headers(param.getHeaders());
        }
        // 4. cookies
        if (param.getCookies() != null) {
            connect.cookies(param.getCookies());
        }
        // 5. raw request body and/or form data
        if (StringUtils.isNotBlank(param.getBody())) {
            connect.requestBody(param.getBody());
        }
        if (param.getData() != null) {
            connect.data(param.getData());
        }
        // 6. proxy
        if (param.isProxy()) {
            connect.proxy(param.getHost(), param.getPort());
        }
        // 7. SSL socket factory
        if (param.isSSL()) {
            connect.sslSocketFactory(param.getSslSocketFactory());
        }
        return connect;
    }

    /**
     * Crawls the content described by the given request parameter.
     * Implemented by each concrete spider.
     *
     * @param param request description
     * @return the crawled content
     */
    public abstract DefaultDto unitedStatesSmash(RequestParam param);

    /**
     * Crawls the content described by the given request parameter and returns it
     * as a list. This default implementation wraps the single result of
     * {@link #unitedStatesSmash(RequestParam)} in a new list.
     *
     * @param param request description
     * @return a new list holding the single crawled result
     */
    public List<DefaultDto> unitedStatesSmashPlus(RequestParam param) {
        DefaultDto single = unitedStatesSmash(param);
        return Lists.newArrayList(single);
    }

    /**
     * Fetches comments for a request. Meant to be overridden by concrete spiders;
     * this default returns a new empty list.
     *
     * @param param request object
     * @return a new, empty list
     */
    public List<DefaultDto.Comment> getComments(RequestParam param) {
        List<DefaultDto.Comment> noComments = Lists.newArrayList();
        return noComments;
    }

    /**
     * Extracts comments from page content given as a string. Meant to be overridden
     * by concrete spiders; this default returns a new empty list.
     *
     * @param body page content
     * @return a new, empty list
     */
    public List<DefaultDto.Comment> getComments(String body) {
        List<DefaultDto.Comment> noComments = Lists.newArrayList();
        return noComments;
    }

    /**
     * Extracts comments from a parsed page. Meant to be overridden by concrete
     * spiders; this default returns a new empty list.
     *
     * @param document parsed page content
     * @return a new, empty list
     */
    public List<DefaultDto.Comment> getComments(Document document) {
        List<DefaultDto.Comment> noComments = Lists.newArrayList();
        return noComments;
    }

    /**
     * Crawls the requested content and, on failure, degrades to the
     * {@code ContentSource.UNKNOWN} spider (per the original note: only the source
     * page's link, title and icon).
     * <p>
     * If this spider's content source already resolves to the unknown-source spider,
     * the original exception is rethrown. If the fallback attempt fails as well, the
     * fallback exception is thrown with the original failure attached as a suppressed
     * exception so the root cause is not lost.
     *
     * @param param request description
     * @return the crawled content, possibly produced by the unknown-source spider
     */
    public DefaultDto smashOnErrorWithUnknown(RequestParam param) {
        try {
            return unitedStatesSmash(param);
        } catch (Exception e) {
            AbstractSpider unknown = ContentSource.UNKNOWN.getSpider();
            // Already the unknown-source spider: nothing to degrade to.
            if (getContentSource().getSpider() == unknown) {
                throw e;
            }
            try {
                return unknown.unitedStatesSmash(param);
            } catch (RuntimeException fallbackFailure) {
                // Keep the first failure visible; addSuppressed rejects self-suppression.
                if (fallbackFailure != e) {
                    fallbackFailure.addSuppressed(e);
                }
                throw fallbackFailure;
            }
        }
    }

    /**
     * Spider-specific URL check. Called by {@code checkParam} after the generic
     * URL format validation has passed.
     * NOTE(review): how a rejection is signalled is up to the implementation —
     * presumably by throwing; confirm against the concrete spiders.
     *
     * @param url the request URL to check
     */
    protected abstract void checkHost(String url);

    /**
     * Validates a request parameter before a connection is built.
     * <ul>
     *   <li>the parameter and its URL must be present</li>
     *   <li>the URL part before the first {@code ?} must match {@code URL_REGEX}</li>
     *   <li>the spider-specific {@link #checkHost(String)} is applied to the URL</li>
     *   <li>in proxy mode, the proxy host must be non-blank and the port within 1-65535</li>
     *   <li>in SSL mode, an {@code SSLSocketFactory} must be supplied</li>
     * </ul>
     *
     * @param param the request parameter to validate
     * @throws MaiQuanSpiderException with the code of {@code ErrorCode.E001} when any check fails
     */
    protected void checkParam(RequestParam param) {
        if (param == null) {
            throw new MaiQuanSpiderException(ErrorCode.E001.getCode(), "请求参数不能为空");
        }
        // Read the URL once through the getter so every check sees the same value.
        String url = param.getUrl();
        if (StringUtils.isBlank(url)) {
            throw new MaiQuanSpiderException(ErrorCode.E001.getCode(), "爬取地址不能为空");
        }
        // URL format check, applied to the part before the query string
        Pattern pattern = Pattern.compile(URL_REGEX);
        String urlWithoutQuery = StringUtils.splitByWholeSeparator(url, "?")[0];
        if (!pattern.matcher(urlWithoutQuery).matches()) {
            throw new MaiQuanSpiderException(ErrorCode.E001.getCode(), "爬取地址不合法");
        }
        // spider-specific host check
        checkHost(url);
        // proxy settings
        if (param.isProxy()) {
            if (StringUtils.isBlank(param.getHost())) {
                throw new MaiQuanSpiderException(ErrorCode.E001.getCode(), "代理模式下,代理host不能为空");
            }
            // The port is mandatory in proxy mode; reject the unset default (0) and out-of-range values.
            int proxyPort = param.getPort();
            if (proxyPort <= 0 || proxyPort > 65535) {
                throw new MaiQuanSpiderException(ErrorCode.E001.getCode(), "代理模式下,代理端口不合法");
            }
        }
        // SSL settings
        if (param.isSSL() && param.getSslSocketFactory() == null) {
            throw new MaiQuanSpiderException(ErrorCode.E001.getCode(), "指定ssl证书时,sslSocketFactory不能为空");
        }
    }

    /**
     * Request parameters for a spider: target URL, HTTP method, headers, cookies,
     * form data / raw body, and optional proxy and SSL settings.
     * <p>
     * New instances start with a default {@code User-Agent} header and the
     * redirect flag set to {@code true}.
     */
    public static class RequestParam {

        // target URL
        private String url;

        // request method: GET / POST etc.
        private Connection.Method method;

        // request headers
        private Map<String, String> headers;

        // request cookies
        private Map<String, String> cookies;

        // form data
        private Map<String, String> data;

        // raw request body
        private String body;

        // whether a proxy is required; not required by default
        private boolean isProxy;

        // proxy host; required when isProxy is true
        private String host;

        // proxy port; required when isProxy is true
        private int port;

        // whether a custom SSL socket factory is required
        private boolean isSSL;

        // SSL socket factory; required when isSSL is true
        private SSLSocketFactory sslSocketFactory;

        // redirect flag; initialised to true by the constructors
        private boolean redirect;

        /**
         * Shared initialisation: installs the default {@code User-Agent} header and
         * turns the redirect flag on.
         */
        private RequestParam() {
            this.headers = Maps.newHashMap();
            this.headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36");
            this.redirect = true;
        }

        /**
         * Creates a GET request parameter for the given URL.
         *
         * @param url target URL
         */
        public RequestParam(String url) {
            this();
            this.url = url;
            this.method = Connection.Method.GET;
        }

        /**
         * Creates a request parameter for the given URL and HTTP method.
         *
         * @param url    target URL
         * @param method HTTP method
         */
        public RequestParam(String url, Connection.Method method) {
            this();
            this.url = url;
            this.method = method;
        }

        /** @return the redirect flag */
        public boolean isRedirect() {
            return redirect;
        }

        /** @param redirect the redirect flag */
        public void setRedirect(boolean redirect) {
            this.redirect = redirect;
        }

        /** @return the target URL */
        public String getUrl() {
            return url;
        }

        /** @param url the target URL */
        public void setUrl(String url) {
            this.url = url;
        }

        /** @return the HTTP method */
        public Connection.Method getMethod() {
            return method;
        }

        /** @param method the HTTP method */
        public void setMethod(Connection.Method method) {
            this.method = method;
        }

        /** @return the request headers; may be {@code null} if replaced via the setter */
        public Map<String, String> getHeaders() {
            return headers;
        }

        /** @param headers the request headers; replaces the defaults, including the User-Agent */
        public void setHeaders(Map<String, String> headers) {
            this.headers = headers;
        }

        /** @return the request cookies, or {@code null} if none were set */
        public Map<String, String> getCookies() {
            return cookies;
        }

        /** @param cookies the request cookies */
        public void setCookies(Map<String, String> cookies) {
            this.cookies = cookies;
        }

        /** @return the form data, or {@code null} if none was set */
        public Map<String, String> getData() {
            return data;
        }

        /** @param data the form data */
        public void setData(Map<String, String> data) {
            this.data = data;
        }

        /** @return the raw request body, or {@code null} if none was set */
        public String getBody() {
            return body;
        }

        /** @param body the raw request body */
        public void setBody(String body) {
            this.body = body;
        }

        /** @return whether a proxy is to be used */
        public boolean isProxy() {
            return isProxy;
        }

        /** @param proxy whether a proxy is to be used */
        public void setProxy(boolean proxy) {
            isProxy = proxy;
        }

        /** @return the proxy host */
        public String getHost() {
            return host;
        }

        /** @param host the proxy host */
        public void setHost(String host) {
            this.host = host;
        }

        /** @return the proxy port */
        public int getPort() {
            return port;
        }

        /** @param port the proxy port */
        public void setPort(int port) {
            this.port = port;
        }

        /** @return whether a custom SSL socket factory is to be used */
        public boolean isSSL() {
            return isSSL;
        }

        /** @param ssl whether a custom SSL socket factory is to be used */
        public void setSSL(boolean ssl) {
            isSSL = ssl;
        }

        /** @return the SSL socket factory, or {@code null} if none was set */
        public SSLSocketFactory getSslSocketFactory() {
            return sslSocketFactory;
        }

        /** @param sslSocketFactory the SSL socket factory */
        public void setSslSocketFactory(SSLSocketFactory sslSocketFactory) {
            this.sslSocketFactory = sslSocketFactory;
        }

        /**
         * Best-effort proxy configuration from {@code Data5UProxyProvider}.
         * <p>
         * Proxy mode is switched on only after a usable host and port have been
         * obtained. If the provider fails or yields no usable proxy, this object is
         * left exactly as it was, so a failed lookup never leaves proxy mode enabled
         * without a host.
         */
        public void autoProxy() {
            try {
                Data5UProxyProvider.Proxy proxy = Data5UProxyProvider.getProxyFromData5u();
                if (proxy == null || StringUtils.isBlank(proxy.getHost())) {
                    return;
                }
                // Resolve both values before mutating any state.
                String proxyHost = proxy.getHost();
                int proxyPort = proxy.getPort();
                this.setHost(proxyHost);
                this.setPort(proxyPort);
                this.setProxy(true);
            } catch (Exception ignored) {
                // Deliberately best effort: without a proxy the request is sent directly.
            }
        }
    }
}
