package cn.com.duiba.spider.util.maiquan.spider;

import cn.com.duiba.spider.util.maiquan.constant.ContentSource;
import cn.com.duiba.spider.util.maiquan.ImageUtil;
import cn.com.duiba.spider.util.maiquan.dto.DefaultDto;
import cn.com.duiba.wolf.utils.UUIDUtils;
import com.alibaba.fastjson.JSON;
import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Spider for pages from an unrecognized source. It extracts only the page title
 * and a single image (falling back to a per-site or default icon).
 *
 * @author: sjx
 * @date: 2019-05-27 15:47
 */
public class UnknownSpider extends AbstractSpider {
    /** Icon used when no page image is usable and the URL matches no known site. */
    private static final String DEFAULT_ICON = "https://yun.duiba.com.cn/magic/link_icon.png";

    private static final String IXIGUA_ICON = "https://yun.duiba.com.cn/magic/ixigua.png";
    private static final String ZHIHU_ICON = "https://yun.duiba.com.cn/magic/zhihu.png";
    private static final String TANGDOU_ICON = "https://yun.duiba.com.cn/magic/tangdou.png";
    private static final String TOUTIAO_ICON = "https://yun.duiba.com.cn/magic/toutiao.png";

    /** Only the first few {@code <img>} elements of a page are tried as the post image. */
    private static final int MAX_IMAGE_CANDIDATES = 3;

    /** Shared instance; the constructor is private so this is the only one created here. */
    public static final UnknownSpider SPIDER = new UnknownSpider();

    private UnknownSpider() {
    }

    @Override
    protected ContentSource getContentSource() {
        return ContentSource.UNKNOWN;
    }

    /**
     * Fetches the page behind {@code param} and builds a {@link DefaultDto} holding its title,
     * exactly one image URL and a freshly generated source id.
     *
     * <p>The image is the first usable one among the leading {@code <img>} elements of the page;
     * when none is usable, an icon chosen from the page URL is used instead. When the page has
     * no (or a blank) title, the page URL itself becomes the title.
     *
     * @param param request parameters; its URL is the page to fetch
     * @return the populated DTO, with a single-element image list
     */
    @Override
    public DefaultDto unitedStatesSmash(RequestParam param) {
        Document document = doRequestDocument(param);
        String pageUrl = param.getUrl();
        DefaultDto defaultDto = new DefaultDto(pageUrl, this.getContentSource().getCode());
        defaultDto.setPostTitle(document.title());

        String imageUrl = findFirstUsableImage(document.select("img"), pageUrl);
        if (StringUtils.isBlank(imageUrl)) {
            // No page image could be used: fall back to a site-specific or default icon.
            imageUrl = getIconByHost(pageUrl);
        }
        defaultDto.setImageList(Lists.newArrayList(imageUrl));

        defaultDto.setSourceId(UUIDUtils.createUUID());
        if (StringUtils.isBlank(defaultDto.getPostTitle())) {
            defaultDto.setPostTitle(pageUrl);
        }
        return defaultDto;
    }

    /**
     * Intentionally accepts every URL: this spider handles pages from any host.
     *
     * @param url the URL being checked (ignored)
     */
    @Override
    protected void checkHost(String url) {
        //do nothing
    }

    /**
     * Returns the URL of the first usable image among the first
     * {@link #MAX_IMAGE_CANDIDATES} {@code <img>} elements.
     *
     * <p>An element is skipped when its {@code src} is blank, when
     * {@link ImageUtil#getByUrl(String)} returns {@code null} for it, or when the returned
     * image carries a blank URL. Skipped elements still count toward the limit.
     *
     * @param imgs    the {@code <img>} elements of the page, may be empty
     * @param pageUrl URL of the page, used to resolve protocol-relative sources
     * @return the image URL, or {@code null} when no candidate is usable
     */
    private String findFirstUsableImage(Elements imgs, String pageUrl) {
        if (imgs == null) {
            return null;
        }
        int candidates = Math.min(imgs.size(), MAX_IMAGE_CANDIDATES);
        for (int i = 0; i < candidates; i++) {
            String src = imgs.get(i).attr("src");
            if (StringUtils.isBlank(src)) {
                continue;
            }
            if (src.startsWith("//")) {
                // Protocol-relative reference: it inherits the scheme of the page it appears on.
                src = schemeOf(pageUrl) + src;
            }
            DefaultDto.Image image = ImageUtil.getByUrl(src);
            if (image != null && StringUtils.isNotBlank(image.getUrl())) {
                return image.getUrl();
            }
        }
        return null;
    }

    /**
     * Returns the scheme prefix (including the trailing colon) to prepend to a
     * protocol-relative reference found on {@code pageUrl}.
     *
     * @param pageUrl URL of the page, may be {@code null}
     * @return {@code "https:"} for an https page, otherwise {@code "http:"}
     */
    private static String schemeOf(String pageUrl) {
        return StringUtils.startsWithIgnoreCase(pageUrl, "https:") ? "https:" : "http:";
    }

    /**
     * Picks an icon for the given page URL by looking for a known site name in the part of the
     * URL that precedes the query string.
     *
     * @param url the page URL, may be {@code null} or empty
     * @return the matching site icon, or {@link #DEFAULT_ICON} when no known site matches
     */
    private String getIconByHost(String url) {
        // substringBefore is null/empty-safe, unlike indexing into the result of a split.
        String urlWithoutQuery = StringUtils.substringBefore(url, "?");
        if (StringUtils.containsIgnoreCase(urlWithoutQuery, "ixigua.com")) {
            return IXIGUA_ICON;
        }
        if (StringUtils.containsIgnoreCase(urlWithoutQuery, "zhihu.com")) {
            return ZHIHU_ICON;
        }
        if (StringUtils.containsIgnoreCase(urlWithoutQuery, "tangdou.com")) {
            return TANGDOU_ICON;
        }
        if (StringUtils.containsIgnoreCase(urlWithoutQuery, "toutiao.com")) {
            return TOUTIAO_ICON;
        }
        return DEFAULT_ICON;
    }

    /**
     * Manual smoke test: crawls a sample page and prints the resulting DTO as JSON to stderr.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        DefaultDto defaultDto = ContentSource.UNKNOWN.getSpider().unitedStatesSmash(new RequestParam("https://www.zhihu.com/question/300735799/answer/624840650"));
        System.err.println(JSON.toJSONString(defaultDto));
    }
}
