package cn.com.duiba.spider.util.maiquan.spider;

import cn.com.duiba.spider.util.maiquan.constant.ContentSource;
import cn.com.duiba.spider.util.maiquan.ImageUtil;
import cn.com.duiba.spider.util.maiquan.dto.DefaultDto;
import cn.com.duiba.spider.util.maiquan.exception.ErrorCode;
import cn.com.duiba.spider.util.maiquan.exception.MaiQuanSpiderException;
import cn.com.duiba.spider.util.maiquan.tts.TextUtil;
import cn.com.duiba.spider.util.maiquan.tts.baidu.keyword.BaiduKeywordUtil;
import cn.com.duiba.spider.util.maiquan.tts.baidu.summary.BaiduSummaryUtil;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.google.common.collect.Lists;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Connection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.List;

/**
 * Spider for Meipian (美篇) articles.
 *
 * <p>Parses an article page into a {@link DefaultDto} (title, author, images, full text,
 * abstract, tags) and fetches the article's comments through the Meipian comment endpoint.
 *
 * @author: sjx
 * @date: 2019-05-27 15:42
 */
public class MeiPianSpider extends AbstractSpider {

    /**
     * Comment list endpoint.
     */
    private static final String COMMENT_URL = "https://poster.meipian.cn/4.5/comment/list";

    /**
     * JSON body template for the comment request; {@code %s} is replaced with the article id.
     */
    private static final String COMMENT_PARAM = "{\"access\":\"1000|guest\",\"con\":\"comment\",\"fn\":\"listV2\",\"article_id\":\"%s\",\"wuser_id\":0,\"max_id\":0,\"size\":20}";

    /**
     * Shared instance; the constructor is private so this is the only one created by this class.
     */
    public static final MeiPianSpider SPIDER = new MeiPianSpider();

    private MeiPianSpider() {

    }

    @Override
    protected ContentSource getContentSource() {
        return ContentSource.MEIPIAN;
    }

    /**
     * Crawls a Meipian article page and converts it into a {@link DefaultDto}.
     *
     * <p>The title block and at least one content section are mandatory. Author details,
     * comments and tags are best-effort: when they cannot be obtained the article is still
     * returned without them.
     *
     * @param param request description; its URL must point at the article page
     * @return the populated article DTO
     * @throws MaiQuanSpiderException if the title block, the content sections or the article id
     *                                cannot be extracted
     */
    @Override
    public DefaultDto unitedStatesSmash(RequestParam param) {
        Document document = doRequestDocument(param);
        Elements bodyDivs = document.body().select("div");
        Element articleMeta = bodyDivs.select(".article-meta").first();
        if (articleMeta == null) {
            throw new MaiQuanSpiderException(ErrorCode.E004.getCode(), "美篇文章标题获取失败");
        }
        Elements sections = bodyDivs.select(".content-container").select(".section");
        if (CollectionUtils.isEmpty(sections)) {
            throw new MaiQuanSpiderException(ErrorCode.E004.getCode(), "美篇文章内容获取失败");
        }

        DefaultDto defaultDto = new DefaultDto(param.getUrl(), this.getContentSource().getCode());
        defaultDto.setAuthor(extractAuthor(bodyDivs));
        List<String> imgList = Lists.newArrayList();
        defaultDto.setImageList(imgList);
        String title = articleMeta.select("h1").select(".title").html();
        defaultDto.setPostTitle(title);
        defaultDto.setSourceId(extractSourceId(param.getUrl()));
        defaultDto.setVideoUrl("");

        // Walk the sections in document order, collecting image URLs and paragraph text.
        StringBuilder sb = new StringBuilder(TextUtil.joinSplitor4Text(title)).append("\n");
        for (Element section : sections) {
            for (Element img : section.select(".img-box").select("img")) {
                imgList.add(img.attr("src"));
            }
            Elements texts = section.select(".text");
            if (CollectionUtils.isNotEmpty(texts)) {
                sb.append(TextUtil.joinSplitor4Text(texts.text())).append("\n");
            }
        }
        String fullText = sb.toString();
        defaultDto.setSourceAbstract(BaiduSummaryUtil.summary4Text(title, fullText));
        defaultDto.setImages(ImageUtil.listByUrls(imgList));
        defaultDto.setFullText(fullText);

        // Comments are optional: a failing comment request must not fail the article crawl.
        try {
            RequestParam commentParam = new RequestParam(COMMENT_URL, Connection.Method.POST);
            commentParam.setBody(String.format(COMMENT_PARAM, defaultDto.getSourceId()));
            defaultDto.setComments(getComments(commentParam));
        } catch (Exception ignored) {
            // Deliberately ignored: the article is returned without comments.
        }
        // Tags are optional as well.
        try {
            defaultDto.setTags(BaiduKeywordUtil.tag4Text(defaultDto.getPostTitle(), defaultDto.getFullText()));
        } catch (IOException ignored) {
            // Deliberately ignored: the article is returned without tags.
        }
        return defaultDto;
    }

    /**
     * Requests the comment list and maps each entry to a {@link DefaultDto.Comment}.
     *
     * @param param request for the comment endpoint
     * @return the parsed comments; empty when the response is missing or contains no
     *         {@code comments} array
     */
    @Override
    public List<DefaultDto.Comment> getComments(RequestParam param) {
        List<DefaultDto.Comment> list = Lists.newArrayList();
        JSONObject json = doRequestJSON(param);
        if (json == null) {
            return list;
        }
        JSONArray comments = json.getJSONArray("comments");
        if (comments == null || comments.isEmpty()) {
            return list;
        }
        for (int i = 0; i < comments.size(); i++) {
            JSONObject c = comments.getJSONObject(i);
            String text = c.getString("comment");
            String head = c.getString("head_img_url");
            String name = c.getString("nickname");
            list.add(new DefaultDto.Comment(text, head, name));
        }
        return list;
    }

    /**
     * No host validation is performed for this source.
     *
     * @param url the URL that would be checked
     */
    @Override
    protected void checkHost(String url) {
        //do nothing
    }

    /**
     * Reads the author's nickname and avatar from the right-hand author panel.
     * Missing markup yields empty strings rather than an exception.
     */
    private static DefaultDto.Author extractAuthor(Elements bodyDivs) {
        Element authorInfo = bodyDivs.select(".pc_right_bar").select(".authermessage").first();
        if (authorInfo == null) {
            return new DefaultDto.Author("", "");
        }
        String headImg = parseHeadImage(authorInfo.select(".autherheader").first());
        String nickname = authorInfo.select(".authernickname").text();
        return new DefaultDto.Author(nickname, headImg);
    }

    /**
     * Extracts the URL between the first "(" and the following ")" of the element's inline
     * style, e.g. {@code background-image: url(...)}; returns "" when there is none.
     */
    private static String parseHeadImage(Element header) {
        if (header == null) {
            return "";
        }
        String raw = StringUtils.substringBetween(header.attr("style"), "(", ")");
        if (raw == null) {
            return "";
        }
        // CSS allows url("...") and url('...'); drop the optional quotes.
        return StringUtils.strip(raw.trim(), "\"'");
    }

    /**
     * Returns the last path segment of the article URL, ignoring any query string or fragment.
     *
     * @throws MaiQuanSpiderException if the URL contains no usable segment
     */
    private static String extractSourceId(String url) {
        String withoutFragment = StringUtils.substringBefore(url, "#");
        String withoutQuery = StringUtils.substringBefore(withoutFragment, "?");
        String path = StringUtils.contains(withoutQuery, "//")
                ? StringUtils.substringAfter(withoutQuery, "//")
                : withoutQuery;
        // Adjacent separators are collapsed, so a trailing "/" does not produce an empty segment.
        String[] segments = StringUtils.splitByWholeSeparator(path, "/");
        if (segments == null || segments.length == 0) {
            throw new MaiQuanSpiderException(ErrorCode.E004.getCode(), "美篇文章id获取失败");
        }
        return segments[segments.length - 1];
    }

    /**
     * Manual smoke test: crawls a sample article and prints the result as JSON to stderr.
     */
    public static void main(String[] args) {
        DefaultDto defaultDto = ContentSource.MEIPIAN.getSpider().unitedStatesSmash(new RequestParam("https://www.meipian.cn/273gk4hd"));
        System.err.println(JSON.toJSONString(defaultDto));
    }
}
