Greasy Fork

Wenku Doc Downloader

下载“百度文库”文档,导出txt或pdf。“豆丁网”“爱问共享资料”(新浪文档)文档导出pdf。

目前为 2021-12-03 提交的版本。查看 最新版本

// ==UserScript==
// @name         Wenku Doc Downloader
// @namespace    http://tampermonkey.net/
// @version      0.7
// @description  下载“百度文库”文档,导出txt或pdf。“豆丁网”“爱问共享资料”(新浪文档)文档导出pdf。
// @author       [email protected]
// @match        https://wenku.baidu.com/view/*
// @match        https://www.docin.com/p-*
// @match        https://ishare.iask.sina.com.cn/f/*
// @icon         https://www.google.com/s2/favicons?domain=limestart.cn
// @grant        none
// @license      GPL-3.0-only
// @create       2021-11-22
// @note         现在支持爱问共享资料(新浪文档)文档导出pdf
// @note         优化了豆丁网的文档pdf导出
// ==/UserScript==

/*
*  附属功能函数部分
*/

/**
 * Wrap `content` in a Blob and hand it to the browser as a file download.
 *
 * @param {string} fileName - File name suggested to the browser.
 * @param {*} content - Payload placed in the Blob (callers visible here pass strings).
 */
function createAndDownloadFile(fileName, content) {
    // Delay before releasing the object URL, so the download started by
    // click() has a chance to read it first.
    const REVOKE_DELAY_MS = 1000;

    const blob = new Blob([content]);
    const objectUrl = URL.createObjectURL(blob);

    const link = document.createElement("a");
    link.download = fileName;
    link.href = objectUrl;
    link.click();

    // revokeObjectURL takes the URL string returned by createObjectURL.
    // Passing the Blob itself (the previous code) released nothing.
    setTimeout(function () {
        URL.revokeObjectURL(objectUrl);
    }, REVOKE_DELAY_MS);
}

/**
 * Tidy text extracted from a text-only document.
 *
 * Removes every single space that sits directly between two CJK ideographs
 * (U+4E00..U+9FA5) and turns non-breaking spaces into ordinary spaces.
 *
 * NOTE(review): the previous version tried to do the first step through a
 * "TEMP" sentinel. Its first pattern contained a mangled lookahead that
 * matched a literal "(=", its replaces only touched the first match, and the
 * final step deleted the first literal "TEMP" found in the document. The
 * direct pattern below was already defined in the old code but never used.
 * Confirm this is the intended clean-up.
 *
 * @param {string} text - Raw concatenated text of the document.
 * @returns {string} The cleaned text.
 */
function formatText(text) {
    // Lookbehind/lookahead keep the ideographs themselves out of the match,
    // so consecutive gaps ("中 文 字") are all removed in one global pass.
    const spaceBetweenCjk = /(?<=[\u4e00-\u9fa5]) (?=[\u4e00-\u9fa5])/g;
    const withoutCjkGaps = text.replace(spaceBetweenCjk, "");

    // Written as an escape so the non-breaking space cannot be silently
    // turned into a plain space by an editor or copy/paste.
    return withoutCjkGaps.replace(/\u00a0/g, " ");
}

/**
 * Tidy text extracted from a document that mixes text and graphics.
 *
 * "Blank" below means any whitespace character other than a line feed
 * (space, non-breaking space, tab, ...). The steps run in order over the
 * whole text:
 *   1. a run of two or more blanks becomes a line break;
 *   2. a blank right before a line break is dropped;
 *   3. consecutive line breaks (optionally with blanks in between) collapse;
 *   4. blanks around a line break that is followed by indentation are dropped;
 *   5. every remaining blank becomes an ordinary space;
 *   6. the phrase "精选文档" standing between spaces/line breaks is removed;
 *   7. repeated line breaks collapse to one.
 *
 * Fixes relative to the previous version: steps 1, 3 and 4 were built with
 * `new RegExp(...)` without the `g` flag and so only fixed the first
 * occurrence, and step 6 called `replace` without a replacement, which
 * inserted the text "undefined" into the output.
 *
 * NOTE(review): the original whitespace character classes contained
 * characters that are not distinguishable in the source; treating them as
 * "any non-newline whitespace" is an assumption to confirm. Replacing the
 * phrase in step 6 with a line break (rather than nothing) is also a choice
 * made here so neighbouring lines are not glued together.
 *
 * @param {string} text - Raw concatenated text of the document.
 * @returns {string} The cleaned text.
 */
function formatText2(text) {
    return text
        .replace(/[^\S\n]{2,}/g, "\n")
        .replace(/[^\S\n]\n/g, "\n")
        .replace(/\n[^\S\n]*\n*\n/g, "\n")
        .replace(/[^\S\n]*\n[^\S\n]+/g, "\n")
        .replace(/[^\S\n]/g, " ")
        .replace(/[ \n]精选文档[ \n]/g, "\n")
        .replace(/\n{2,}/g, "\n");
}

/**
 * Classify the document shown on the current page.
 *
 * The source type is read from the class name of the first child of the
 * first `.doc-title-wrap` element; it is then combined with the number of
 * `.reader-pic-item`, `.reader-word-layer` and `.ppt-image-wrap` elements
 * currently in the page.
 *
 * Fix relative to the previous version: the "txt" test was written as
 * `search("txt" !== -1)`, which searched for the text "true" and treated the
 * usual "not found" result (-1) as a match, so every unrecognised class name
 * was classified as "txt".
 *
 * @returns {string|Object} One of the category strings ("doc-only-pic",
 *     "doc-only-word", "doc-pic-word", "pdf-pic-title", "pdf-only-pic",
 *     "pdf-only-word", "pdf-pic-word", "ppt", "excel-only-word",
 *     "excel-only-pic", "txt"), or, when nothing matches, an object holding
 *     the detected type and the three element counts for diagnostics.
 * @throws {TypeError} When the title wrapper or its first child is missing.
 */
function detectType() {
    const doc_title_wrap = document.getElementsByClassName("doc-title-wrap")[0];
    const file_type = doc_title_wrap.children[0].className;

    // The first keyword contained in the class name wins; the order matches
    // the precedence of the original if/else chain. Unknown class names are
    // kept verbatim so they show up in the diagnostic object.
    const known_types = ["word", "ppt", "excel", "pdf", "txt"];
    let type = file_type;
    for (const candidate of known_types) {
        if (file_type.includes(candidate)) {
            type = candidate;
            break;
        }
    }

    // A count of 0 means the page holds no element of that kind.
    const pic_nums = document.getElementsByClassName("reader-pic-item").length;
    const word_nums = document.getElementsByClassName("reader-word-layer").length;
    const ppt_img_nums = document.getElementsByClassName("ppt-image-wrap").length;

    if (type === "word" && !word_nums && pic_nums) {
        // Word document made of images only.
        return "doc-only-pic";
    }
    if (type === "word" && word_nums > 2 && pic_nums <= 1) {
        // Word document made of text only.
        return "doc-only-word";
    }
    if (type === "word" && pic_nums > 2 && word_nums > 2) {
        // Word document mixing graphics and text.
        return "doc-pic-word";
    }
    if (type === "pdf" && pic_nums > 2 && word_nums === 1) {
        // PDF with a single text block (a title line) followed by graphics.
        return "pdf-pic-title";
    }
    if (type === "pdf" && !word_nums && pic_nums) {
        // PDF made of graphics only.
        return "pdf-only-pic";
    }
    if (type === "pdf" && !pic_nums && word_nums > 1) {
        // PDF made of text only.
        return "pdf-only-word";
    }
    if (type === "pdf" && word_nums > 2 && pic_nums > 1) {
        // PDF mixing graphics and text.
        return "pdf-pic-word";
    }
    if ((type === "ppt" && ppt_img_nums > 2) ||
            (type === "pdf" && !word_nums && !pic_nums && ppt_img_nums)) {
        // Slide deck with at least 3 pages, or a PDF rendered as slide images.
        return "ppt";
    }
    if (type === "excel" && pic_nums && word_nums > 2) {
        // Spreadsheet with selectable text.
        return "excel-only-word";
    }
    if (type === "excel" && pic_nums && !word_nums) {
        // Spreadsheet rendered as graphics only.
        return "excel-only-pic";
    }
    if (type === "txt") {
        // Plain text document.
        return "txt";
    }

    // Nothing matched: report what was seen so it can be included in a
    // feedback message.
    return {"源文档类型": type,
            "图形数量": pic_nums,
            "文字块数量": word_nums,
            "ppt纯图形页面数量": ppt_img_nums};
}

/*
*  主要功能函数部分
*/

/**
 * Strip the docin.com document page down to the document itself and open the
 * browser's print dialog.
 *
 * Each unrelated element is removed independently. Previously all removals
 * shared one try block, so a single missing element (or a missing pointer
 * tool) silently skipped every removal after it.
 *
 * The export button is detached while printing and re-attached afterwards,
 * even if printing throws.
 */
function printPageDocin() {
    // Switch the reader to its pointer tool, when the page offers it.
    const select_tool = document.getElementById("j_select");
    if (select_tool) {
        select_tool.click();
    }

    // Page furniture that should not end up in the print-out. Entries are
    // undefined/null when absent, e.g. on a second run after removal.
    const clutter = [
        document.getElementsByClassName("doc_header_mod")[0],
        document.getElementsByClassName("aside")[0],
        document.getElementsByClassName("no_more_mod")[0],
        document.getElementById("likeToo"),
        document.getElementsByClassName("tools_bottom_bar")[0],
        // NOTE(review): "page_crubms" is kept exactly as in the original;
        // it may be the site's own spelling. Confirm against the live page.
        document.getElementsByClassName("page_crubms")[0],
    ];
    for (const node of clutter) {
        if (node) {
            node.remove();
        }
    }

    // Shift the document towards the centre of the page.
    const doc = document.getElementsByClassName("main")[0];
    if (doc) {
        doc.style.marginLeft = "6%";
    }

    // Take the export button out of the page while printing.
    const btn_2 = document.getElementsByClassName("save-doc-btn")[0];
    const original_parent = btn_2.parentNode;
    btn_2.remove();
    try {
        window.print();
    }
    finally {
        // As before, the button goes into the reader toolbar afterwards;
        // fall back to its previous parent when that toolbar is missing.
        const after_page = document.getElementsByClassName("reader_tools_bar_wrap")[0];
        const new_parent = after_page || original_parent;
        if (new_parent) {
            new_parent.appendChild(btn_2);
        }
    }
}

/**
 * Strip the ishare.iask.sina.com.cn document page down to the document
 * itself, ask the user for a left offset and open the print dialog.
 *
 * Changes relative to the previous version:
 *   - each unrelated element is removed independently, so one missing
 *     element no longer skips every removal after it;
 *   - cancelling the prompt (which yields null) now simply aborts instead of
 *     throwing a TypeError on `offset.length`.
 *
 * The accepted offsets are unchanged: an integer written as "1".."9" or
 * "10".."59"; anything else shows the hint dialog and aborts.
 */
function printPageiShare() {
    // Page furniture that should not end up in the print-out. Entries are
    // undefined/null when absent, e.g. on a second run after removal.
    const clutter = [
        document.getElementsByClassName("detail-topbanner")[0],
        document.getElementsByClassName("new-detail-header")[0],
        document.getElementById("fix-right"),
        document.getElementsByClassName("loginRedPacket-dialog")[0],
        document.getElementsByClassName("fixed-right-full")[0],
        document.getElementsByClassName("website-footer")[0],
        document.getElementsByClassName("guess-you-like-warpper")[0],
        document.getElementsByClassName("detail-top-box")[0],
        document.getElementsByClassName("reader-fullScreen")[0],
        document.getElementsByClassName("endof-trial-reading")[0],
        // Button showing the page-browsing state.
        document.getElementsByClassName("state-bottom")[0],
    ];
    // For the breadcrumb, the element that contains the arrow is removed.
    const crumb_arrow = document.getElementsByClassName("crumb-arrow")[0];
    if (crumb_arrow && crumb_arrow.parentElement) {
        clutter.push(crumb_arrow.parentElement);
    }
    for (const node of clutter) {
        if (node) {
            node.remove();
        }
    }

    // Centre the document: the needed offset differs per document, so the
    // user supplies it as a percentage.
    const doc_main = document.getElementsByClassName("doc-main")[0];
    const offset = window.prompt("请输入偏移百分位:", "10");
    if (offset === null) {
        // The prompt was cancelled; nothing to print.
        return;
    }
    if (!/^(?:[1-9]|[1-5][0-9])$/.test(offset)) {
        alert("请输入一个正整数,范围在1至59之间,用来使文档居中\n(不同文档偏移量不同,所以需要手动调整)");
        return;
    }
    doc_main.style.marginLeft = offset + "%";

    // Hide the export button while printing and always show it again.
    const btn_2 = document.getElementsByClassName("save-doc-btn")[0];
    btn_2.style.display = "none";
    try {
        window.print();
    }
    finally {
        btn_2.style.removeProperty("display");
    }
}

/**
 * Handler for the "expand" button on ishare pages.
 *
 * While the page's first `.red-color` element still offers to continue
 * reading, it is clicked. Otherwise the user is told that the document is
 * fully expanded, the export button is revealed and the expand button hidden.
 */
function readAlliShare() {
    const expand_link = document.getElementsByClassName("red-color")[0];
    const can_expand = expand_link.textContent.includes("点击可继续阅读");

    if (can_expand) {
        expand_link.click();
        return;
    }

    // Nothing left to expand: switch from the expand button to the export button.
    alert("文档已经完全展开,可以导出");
    const expand_btn = document.getElementsByClassName("init-btn")[0];
    const export_btn = document.getElementsByClassName("save-doc-btn")[0];
    export_btn.style.removeProperty("display");
    expand_btn.style.display = "none";
}

/**
 * Show a dialog telling the user how to save the complete page (HTML file
 * plus its resource folder) for the external HTML-to-PDF tool.
 */
function saveHtml() {
    const instructions = [
        "请按下Ctrl+S以保存页面",
        "请保存【网页,全部】或【网页,完成】而非仅HTML或单个文件",
        "保存后应当有1个【xxx.html】文件和1个【xxx_files】文件夹",
        "请复制这两个文件到【HTML转PDF】程序所在的文件夹",
    ];
    // One instruction per line, no trailing line break.
    alert(instructions.join("\n"));
}

/**
 * Create the orange "export pdf (experimental)" button, wire it to
 * `saveHtml` and append it to the first `.btns_section` element.
 */
function createSaveHtmlBtn() {
    const export_btn = document.createElement("button");
    export_btn.setAttribute("class", "save-html-btn");
    export_btn.textContent = "导出pdf(实验性)";

    // Appearance of the button.
    Object.assign(export_btn.style, {
        height: "25px",
        width: "15%",
        marginLeft: "0.2%",
        backgroundColor: "orange",
        border: "none",
        fontWeight: "bold",
        borderRadius: "10%",
    });

    // Hook up the click handler and place the button next to the others.
    export_btn.onclick = saveHtml;
    document.getElementsByClassName("btns_section")[0].appendChild(export_btn);
}

/**
 * Handler for the "expand" button on Baidu Wenku pages.
 *
 * While a `.read-all` ("continue reading") element exists it is clicked.
 * Once it is gone, the document is classified with `detectType()` and the
 * buttons are adjusted: text-bearing documents get the extra "export pdf"
 * button, image-only documents get the export button relabelled, and in all
 * cases the export button is shown and the expand button hidden.
 *
 * Fix relative to the previous version: the image-only Word category was
 * compared against "doc-pic-only", but `detectType()` reports it as
 * "doc-only-pic", so that branch never ran for such documents. Both
 * spellings are accepted now.
 *
 * @returns {undefined}
 */
function readAll() {
    const read_all_btn = document.getElementsByClassName("read-all")[0];
    if (read_all_btn) {
        // More content can still be loaded.
        read_all_btn.click();
        return;
    }

    alert("文档已经完全展开,可以导出");

    let category;
    try {
        category = detectType();
    }
    catch {
        // detectType throws when the page lacks the expected title markup.
        alert("未知/特殊文档类型,例如学术文献,暂不支持下载\n也可与作者反馈或联系:\[email protected]");
        return undefined;
    }

    const init_btn = document.getElementsByClassName("init-btn")[0];
    const save_doc_btn = document.getElementsByClassName("save-doc-btn")[0];

    // Documents with selectable text can also go through the html-to-pdf route.
    const html_exportable = [
        "doc-only-word",
        "doc-pic-word",
        "pdf-only-word",
        "pdf-pic-word",
        "excel-only-word",
    ];
    // Documents made of images: the export button delivers image links.
    const image_only = [
        "doc-only-pic",
        "doc-pic-only", // legacy spelling, kept for compatibility
        "pdf-pic-title",
        "ppt",
        "pdf-only-pic",
        "excel-only-pic",
    ];

    if (html_exportable.includes(category)) {
        // Make room for the additional orange button.
        save_doc_btn.style.width = "34.8%";
        createSaveHtmlBtn();
    }
    else if (image_only.includes(category)) {
        save_doc_btn.textContent = "导出全部图片链接";
    }

    // Swap the expand button for the export button.
    save_doc_btn.style.removeProperty("display");
    init_btn.style.display = "none";
}

/**
 * Collect the background-image URL of every `.reader-pic-item` element and
 * download them as "urls.csv", one URL per line.
 *
 * Each URL is cut out of the element's inline `style` attribute: the text
 * after ": url(" and before "); background-position".
 *
 * Changes relative to the previous version:
 *   - surrounding quotes are stripped from every URL, not only the first;
 *   - elements without a matching style are skipped, and an empty element
 *     list no longer throws.
 */
function savePDFData() {
    const pic_items = document.getElementsByClassName("reader-pic-item");
    // Text up to and including the opening of url(...).
    const url_prefix = /: ?url[(]/;
    // Text from the closing parenthesis onwards.
    const url_suffix = /[)]; ?background-position/;

    const urls = [];
    for (const item of pic_items) {
        const style_text = item.getAttribute("style");
        if (!style_text) {
            continue;
        }
        const after_prefix = style_text.split(url_prefix)[1];
        if (after_prefix === undefined) {
            // No url(...) in this style attribute.
            continue;
        }
        const quoted_url = after_prefix.split(url_suffix)[0];
        urls.push(quoted_url.replace(/^["']|["']$/g, ""));
    }

    // Start the download.
    createAndDownloadFile("urls.csv", urls.join("\n"));
}

/**
 * Export a text-only document: join the text of every `.reader-word-layer`
 * element, clean it with `formatText` and download it as a .txt file.
 */
function saveDocData() {
    const word_layers = document.getElementsByClassName("reader-word-layer");
    // Pieces are concatenated without any separator.
    const raw_text = Array.from(word_layers, (layer) => layer.textContent).join("");
    createAndDownloadFile("纯文本文档.txt", formatText(raw_text));
}

/**
 * Export a slide document: take the `src` of the first child of every
 * `.ppt-image-wrap` element and download the list as "urls.csv", one URL
 * per line.
 */
function savePPTData() {
    const slide_wraps = document.getElementsByClassName("ppt-image-wrap");
    const slide_urls = Array.from(slide_wraps, (wrap) => wrap.children[0].src);
    // Start the download.
    createAndDownloadFile("urls.csv", slide_urls.join("\n"));
}

/**
 * Export a spreadsheet document as one text file containing the URL of the
 * table image followed by the text of every `.reader-word-layer` element
 * (one element per line).
 *
 * Changes relative to the previous version:
 *   - the URL is unwrapped from `url(...)` with a pattern that accepts
 *     double-quoted, single-quoted and unquoted forms, instead of cutting a
 *     fixed number of characters (which corrupted anything but `url("...")`);
 *     a value that is not a single `url(...)` is emitted unchanged;
 *   - the "odd space" replacement is written as an explicit non-breaking
 *     space escape, so it cannot degrade into a no-op.
 */
function saveExcelData() {
    // 1. The table is rendered as the background image of the first picture item.
    const table_pic = document.getElementsByClassName("reader-pic-item")[0];
    const background = table_pic.style.getPropertyValue("background-image");
    const unwrapped = /^url\((["']?)(.*)\1\)$/.exec(background.trim());
    const pure_url = unwrapped ? unwrapped[2] : background;

    // 2. Text found inside the table, with non-breaking spaces normalised.
    const text_elems = document.getElementsByClassName("reader-word-layer");
    const text = Array.from(text_elems, (elem) => elem.textContent)
        .join("\n")
        .replace(/\u00a0/g, " ");

    // 3. Combine both parts into one string and download it.
    const head = "表格图形链接如下(复制到浏览器中打开):";
    const content = head + "\n\n" + pure_url + "\n\n" + text;
    createAndDownloadFile("图片地址和表格内容.txt", content);
}

/**
 * Export the text of a document that mixes text and graphics (only the text
 * can be saved): join the text of every `.reader-word-layer` element, clean
 * it with `formatText2` and download it as a .txt file.
 */
function saveDocAndPicData() {
    const word_layers = document.getElementsByClassName("reader-word-layer");
    // Pieces are concatenated without any separator.
    const raw_text = Array.from(word_layers, (layer) => layer.textContent).join("");
    createAndDownloadFile("纯文本文档.txt", formatText2(raw_text));
}

/**
 * Export a plain-text document: join the text of every `.p-txt` element
 * (no separator, no clean-up) and download it as a .txt file.
 */
function saveTxtData() {
    const paragraphs = document.getElementsByClassName("p-txt");
    const content = Array.from(paragraphs, (paragraph) => paragraph.textContent).join("");
    createAndDownloadFile("纯文本文档.txt", content);
}

/**
 * Handler for the export button on Baidu Wenku pages: classify the document
 * with `detectType()` and run the matching export routine.
 *
 * Fix relative to the previous version: the image-only Word category was
 * compared against "doc-pic-only", but `detectType()` reports it as
 * "doc-only-pic", so such documents ended up in the "unknown type" dialog.
 * Both spellings are accepted now.
 *
 * NOTE(review): the mixed text/graphics categories are still sent to
 * `saveDocData`, as before; `saveDocAndPicData` exists in this file but is
 * not called from here. Confirm whether that is intended.
 */
function saveData() {
    const category = detectType();

    switch (category) {
    case "doc-only-pic":
    case "doc-pic-only": // legacy spelling, kept for compatibility
    case "pdf-pic-title":
    case "pdf-only-pic":
    case "excel-only-pic":
        // Image-only documents: export the image links.
        savePDFData();
        break;
    case "doc-only-word":
    case "doc-pic-word":
    case "pdf-only-word":
    case "pdf-pic-word":
        // Text-heavy, non-table documents: export the plain text.
        saveDocData();
        break;
    case "ppt":
        // Slides are handled like image-only documents.
        savePPTData();
        break;
    case "excel-only-word":
        // Spreadsheets: export the image link plus the contained text.
        saveExcelData();
        break;
    case "txt":
        // Plain text is saved as is.
        saveTxtData();
        break;
    default: {
        // Unknown category: detectType returned its diagnostic object, whose
        // entries are listed in the feedback dialog.
        const info = [];
        for (const key in category) {
            info.push(key + " : " + category[key]);
        }
        alert("未知处理类型,请反馈或联系作者:\[email protected]\n" + info.join("\n"));
    }
    }
}

/**
 * Build the script's two starting buttons inside a new `.btns_section`
 * element appended to the page body.
 *
 * The first button (`.init-btn`, blue) is labelled "展开文档"; the second
 * (`.save-doc-btn`, green) starts hidden and has no label yet.
 *
 * @returns {Array} `[btn_1, btn_2]`, the two button elements.
 */
function create2btns() {
    // Appearance shared by both buttons.
    const shared_style = {
        height: "25px",
        width: "50%",
        marginLeft: "25%",
        border: "none",
        color: "white",
        fontWeight: "bold",
    };

    const btn_1 = document.createElement("button");
    btn_1.setAttribute("class", "init-btn");
    Object.assign(btn_1.style, shared_style, { backgroundColor: "blue" });
    btn_1.textContent = "展开文档";

    const btn_2 = document.createElement("button");
    btn_2.setAttribute("class", "save-doc-btn");
    Object.assign(btn_2.style, shared_style, { backgroundColor: "green", display: "none" });

    // Put both buttons into their own section at the end of the body.
    const section = document.createElement("section");
    section.setAttribute("class", "btns_section");
    section.appendChild(btn_1);
    section.appendChild(btn_2);
    document.body.appendChild(section);

    return [btn_1, btn_2];
}

/*
*  主函数部分
*/

/**
 * Set up the script on Baidu Wenku pages: create the two buttons, label the
 * export button and attach the expand/export handlers.
 */
function baiduWenku() {
    const buttons = create2btns();
    const expand_btn = buttons[0];
    const export_btn = buttons[1];

    export_btn.textContent = "导出纯文本";

    // Attach the handlers.
    expand_btn.onclick = readAll;
    export_btn.onclick = saveData;
}

/**
 * Set up the script on docin.com pages: only the export button is kept; it
 * is labelled, shown immediately and wired to `printPageDocin`.
 */
function docin() {
    const buttons = create2btns();
    const expand_btn = buttons[0];
    const print_btn = buttons[1];

    // No expand step on this site, so the first button is dropped.
    expand_btn.remove();

    print_btn.textContent = "打印页面到PDF";
    print_btn.style.removeProperty("display");

    // Attach the handler.
    print_btn.onclick = printPageDocin;
}

/**
 * Set up the script on ishare.iask.sina.com.cn pages: create the two
 * buttons, label the export button, attach the expand/print handlers and
 * remove the site's bottom download bar.
 *
 * Fix relative to the previous version: when the page has no `.detail-fixed`
 * element, the removal is skipped instead of throwing a TypeError.
 */
function ishare() {
    const [btn_1, btn_2] = create2btns();
    btn_2.textContent = "打印页面到PDF";

    // Attach the handlers.
    btn_1.onclick = readAlliShare;
    btn_2.onclick = printPageiShare;

    // Remove the bottom download bar, when present.
    const detailfixed = document.getElementsByClassName("detail-fixed")[0];
    if (detailfixed) {
        detailfixed.remove();
    }
}

/**
 * Run the site-specific setup that matches the host of the current page;
 * on any other host only a console message is written.
 */
function main() {
    switch (window.location.host) {
    case "wenku.baidu.com":
        baiduWenku();
        break;
    case "www.docin.com":
        docin();
        break;
    case "ishare.iask.sina.com.cn":
        ishare();
        break;
    default:
        console.log("匹配到了无效网页");
    }
}

// Script entry point: run the setup that matches the current page's host.
main();