package cn.com.duiba.spider.util.maiquan.spider;

import cn.com.duiba.spider.util.maiquan.constant.ContentSource;
import cn.com.duiba.spider.util.maiquan.ImageUtil;
import cn.com.duiba.spider.util.maiquan.dto.DefaultDto;

import cn.com.duiba.spider.util.maiquan.exception.ErrorCode;
import cn.com.duiba.spider.util.maiquan.exception.MaiQuanSpiderException;
import cn.com.duiba.spider.util.maiquan.tts.TextUtil;
import cn.com.duiba.spider.util.maiquan.tts.baidu.keyword.BaiduKeywordUtil;
import cn.com.duiba.spider.util.maiquan.tts.baidu.summary.BaiduSummaryUtil;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.google.common.collect.Lists;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Spider for WeChat official-account ("GongZhongHao") articles.
 *
 * <p>The article page is parsed for title, author nickname, body text and images; the
 * account avatar is looked up through the WeiXiaoBao (data.wxb.com) account endpoint.
 *
 * @author: sjx
 * @date: 2019-05-27 15:48
 */
public class GongZhongHaoSpider extends AbstractSpider {

    /** WeiXiaoBao account-info endpoint; {@code %s} is replaced by the account's user name. */
    private static final String WXB_USER_INFO_URL = "https://data.wxb.com/account/index/%s?is_new=1";

    /** Marker preceding the account user name inside the article page's inline script. */
    private static final String USER_NAME_PREFIX = "var user_name = \"";

    /** Marker terminating the account user name inside the article page's inline script. */
    private static final String USER_NAME_SUFFIX = "\";";

    /** Images whose declared {@code width} attribute is below this value (in px) are dropped. */
    private static final int MIN_IMAGE_WIDTH = 300;

    private GongZhongHaoSpider() {
    }

    @Override
    public ContentSource getContentSource() {
        return ContentSource.GONGZHONGHAO;
    }

    /** Shared instance; the constructor is private so this is the only instance created here. */
    public static final GongZhongHaoSpider SPIDER = new GongZhongHaoSpider();

    /**
     * Fetches the article at {@code param}'s URL and converts it into a {@link DefaultDto}.
     *
     * <p>Populates title, source id (MD5 of the title), author, image list, full text,
     * abstract and tags. Tag extraction is best-effort: an {@link IOException} from the
     * keyword service leaves the tags unset instead of failing the crawl.
     *
     * @param param request parameters carrying the article URL
     * @return the populated DTO
     * @throws MaiQuanSpiderException if the title, the content container or the account
     *         user name cannot be found in the page
     * @throws RuntimeException if the avatar cannot be resolved
     */
    @Override
    public DefaultDto unitedStatesSmash(RequestParam param) {
        DefaultDto defaultDto = new DefaultDto(param.getUrl(), this.getContentSource().getCode());
        Document document = doRequestDocument(param);
        Elements bodyDivs = document.body().select("div");

        Elements titleElement = bodyDivs.select(".rich_media_title").select("#activity-name");
        if (CollectionUtils.isEmpty(titleElement)) {
            throw new MaiQuanSpiderException(ErrorCode.E004.getCode(), "公众号文章标题获取失败");
        }
        String title = titleElement.text();
        defaultDto.setPostTitle(title);
        defaultDto.setSourceId(DigestUtils.md5Hex(title));

        Elements contentElements = bodyDivs.select(".rich_media_content");
        if (CollectionUtils.isEmpty(contentElements)) {
            throw new MaiQuanSpiderException(ErrorCode.E004.getCode(), "公众号文章内容获取失败");
        }

        String authorName = bodyDivs.select(".profile_nickname").text();
        String wxName = extractWxName(document.toString());
        String headImg = getHeadImg(wxName);
        defaultDto.setAuthor(new DefaultDto.Author(authorName, headImg));

        // First pass: filter on the declared <img> width and skip GIFs / images without a URL.
        Element content = contentElements.get(0);
        List<String> imgs = content.select("img").stream()
                .filter(GongZhongHaoSpider::isWideEnough)
                .map(img -> img.attr("data-src"))
                .filter(StringUtils::isNotBlank)
                .filter(url -> !StringUtils.containsIgnoreCase(url, "mmbiz_gif")
                        && !StringUtils.containsIgnoreCase(url, "wx_fmt=gif"))
                .collect(Collectors.toList());
        defaultDto.setImageList(imgs);

        // Second pass: ImageUtil may return fewer images than URLs supplied (the original
        // comment says it filters on real image size); keep the URL list in sync with it.
        List<DefaultDto.Image> images = ImageUtil.listByUrls(imgs);
        defaultDto.setImages(images);
        if (imgs.size() != images.size()) {
            defaultDto.setImageList(images.stream().map(DefaultDto.Image::getUrl).collect(Collectors.toList()));
        }

        List<String> paragraphs = Lists.newArrayList();
        text4Elements(contentElements, paragraphs);
        StringBuilder sb = new StringBuilder(title).append("。");
        paragraphs.forEach(s -> sb.append(s).append("\n"));
        String fullText = sb.toString();
        defaultDto.setFullText(fullText);
        defaultDto.setSourceAbstract(BaiduSummaryUtil.summary4Text(title, fullText));
        try {
            defaultDto.setTags(BaiduKeywordUtil.tag4Text(defaultDto.getPostTitle(), defaultDto.getFullText()));
        } catch (IOException ignored) {
            // Deliberately best-effort: tags are optional, so a keyword-service I/O failure
            // must not discard an otherwise complete article.
        }
        return defaultDto;
    }

    /**
     * Extracts the account user name from the {@code var user_name = "...";} assignment
     * embedded in the page source.
     *
     * @param html full page source
     * @return the text between the prefix marker and the next {@code ";}, or the remainder
     *         of the page when the terminator is absent
     * @throws MaiQuanSpiderException if the page contains no {@code user_name} assignment
     */
    private static String extractWxName(String html) {
        int start = html.indexOf(USER_NAME_PREFIX);
        if (start < 0) {
            // Previously surfaced as a bare ArrayIndexOutOfBoundsException from split()[1].
            throw new MaiQuanSpiderException(ErrorCode.E004.getCode(), "公众号微信号获取失败");
        }
        start += USER_NAME_PREFIX.length();
        int end = html.indexOf(USER_NAME_SUFFIX, start);
        return end < 0 ? html.substring(start) : html.substring(start, end);
    }

    /**
     * Decides whether an {@code <img>} passes the declared-width filter.
     *
     * @param img the image element
     * @return {@code false} only when the {@code width} attribute (with any "px" removed)
     *         parses to an integer below {@link #MIN_IMAGE_WIDTH}; a missing or
     *         non-numeric width keeps the image
     */
    private static boolean isWideEnough(Element img) {
        try {
            int width = Integer.parseInt(StringUtils.replace(img.attr("width"), "px", ""));
            return width >= MIN_IMAGE_WIDTH;
        } catch (NumberFormatException ignored) {
            // No usable declared width (absent, "100%", "auto", ...): keep the image.
            return true;
        }
    }

    /**
     * Looks up the account avatar URL through WeiXiaoBao.
     *
     * @param wxName account user name taken from the article page
     * @return the {@code data.avatar} value of the response (may be {@code null} if the
     *         field itself is absent)
     * @throws RuntimeException if {@code wxName} is blank or the response has no
     *         {@code data} object
     */
    private String getHeadImg(String wxName) {
        if (StringUtils.isBlank(wxName)) {
            throw new RuntimeException("头像获取失败");
        }
        String userInfoUrl = String.format(WXB_USER_INFO_URL, wxName);
        JSONObject jsonObject = doRequestJSON(new RequestParam(userInfoUrl));
        JSONObject data = jsonObject == null ? null : jsonObject.getJSONObject("data");
        if (data == null) {
            // Fail with the same error as the blank-name case instead of a NullPointerException.
            throw new RuntimeException("头像获取失败");
        }
        return data.getString("avatar");
    }

    /**
     * Appends the non-blank text of each element, passed through
     * {@link TextUtil#joinSplitor4Text}, to {@code texts}.
     *
     * @param e     elements to read text from
     * @param texts output list, mutated in place
     */
    private void text4Elements(Elements e, List<String> texts) {
        for (Element element : e) {
            String text = element.text();
            if (StringUtils.isNotBlank(text)) {
                texts.add(TextUtil.joinSplitor4Text(text));
            }
        }
    }

    /**
     * Intentionally empty: this spider performs no host validation on the URL.
     */
    @Override
    protected void checkHost(String url) {
    }

    /**
     * Manual smoke test: crawls a sample article and prints the resulting DTO as JSON to
     * stderr, with the full text blanked to keep the output short.
     */
    public static void main(String[] args) {
        DefaultDto defaultDto = SPIDER.unitedStatesSmash(new RequestParam("https://mp.weixin.qq.com/s/NMocN_ciG9p6o0IVBZbiEQ"));
        defaultDto.setFullText("");
        System.err.println(JSON.toJSONString(defaultDto));
    }
}
