Greasy Fork

MagicScraper

Scrapes and displays data from the web page based on rules.

目前为 2023-07-20 提交的版本。查看 最新版本

此脚本不应直接安装,它是一个供其他脚本使用的外部库。如果您需要使用该库,请在脚本元属性加入:// @require https://update.greasyfork.cloud/scripts/471264/1222752/MagicScraper.js

// ==UserScript==
// @name         MagicScraper
// @namespace    http://tampermonkey.net/
// @version      0.1
// @description  Scrapes and displays data from the web page based on rules.
// @author       aolko
// @match        *://*/*
// @grant        GM_addStyle
// ==/UserScript==

/**
 * Replaces the content of `document.body` with elements generated from the
 * rule set that matches the current hostname and pathname.
 *
 * Rule sets have the shape
 * `{ [domainPattern]: { ...fields, pages: { [pagePattern]: { ...fields } } } }`.
 * The first domain pattern (in key order) that matches `location.hostname`
 * is used. Its fields are merged with the fields of the page entry that
 * matches `location.pathname` (page fields win). Patterns are
 * case-insensitive globs where `*` matches any run of characters; every
 * other character is literal.
 *
 * String fields become elements whose `innerHTML` is the string; object
 * fields become wrapper elements that are filled recursively. All other
 * value types (numbers, booleans, null, ...) are ignored.
 *
 * @param {Object|string} rules - Rule set object, or a URL that returns JSON
 *   of the form `{ "rules": { ... } }`.
 * @param {Object} [options]
 * @param {boolean} [options.keepChildren] - When truthy, every element
 *   created for a string field first adopts the children its parent already
 *   has (i.e. the previously generated siblings) before being appended.
 * @returns {void}
 */
function magicScraper(rules, options = {}) {
  // Used when the parent has no tagName, which is the case for the
  // DocumentFragment the top-level call starts from.
  const FALLBACK_TAG = 'div';

  /** Escapes every regular-expression metacharacter in `text`. */
  function escapeRegExp(text) {
    return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Recursively appends one element per string/object entry of `data` to
   * `element`, reusing the parent's tag name for the new elements.
   */
  function createDOMFromScrapedData(data, element, keepChildren) {
    if (data === null || typeof data !== 'object') {
      return;
    }

    const tagName = element.tagName || FALLBACK_TAG;

    for (const key of Object.keys(data)) {
      const value = data[key];

      if (typeof value === 'string') {
        const newElement = document.createElement(tagName);
        // NOTE(security): the rule value is inserted as raw HTML. Rules
        // loaded from an external URL must be trusted, otherwise this is a
        // script-injection vector.
        newElement.innerHTML = value;

        if (keepChildren && element.children.length > 0) {
          // Snapshot first: appendChild moves nodes out of the live list.
          Array.from(element.children).forEach(child => {
            newElement.appendChild(child);
          });
        }

        element.appendChild(newElement);
      } else if (value !== null && typeof value === 'object') {
        const newElement = document.createElement(tagName);
        element.appendChild(newElement);
        createDOMFromScrapedData(value, newElement, keepChildren);
      }
    }
  }

  /**
   * Tests `current` against a glob `pattern`. `*` stands for any run of
   * characters (every occurrence, not just the first); all other characters
   * are matched literally and case-insensitively.
   */
  function matchPageOrDomain(pattern, current) {
    if (pattern === '*' || pattern === current) return true;
    const source = String(pattern).split('*').map(escapeRegExp).join('.*');
    return new RegExp(`^${source}$`, 'i').test(current);
  }

  /**
   * Picks the rules for the current domain and page and merges them into a
   * single flat object. Returns `{}` when nothing matches.
   */
  function scrapeDataByRules(rulesObj, currentDomain, currentPage) {
    if (rulesObj === null || typeof rulesObj !== 'object') {
      return {};
    }

    const domainKey = Object.keys(rulesObj).find(domainPattern =>
      matchPageOrDomain(domainPattern, currentDomain)
    );
    const domainData = domainKey === undefined ? undefined : rulesObj[domainKey];
    if (domainData === null || typeof domainData !== 'object') {
      return {};
    }

    let pageData = {};
    const pages = domainData.pages;
    if (currentPage && pages !== null && typeof pages === 'object') {
      // An exact path entry wins; otherwise fall back to the first glob match.
      const pageKey = Object.prototype.hasOwnProperty.call(pages, currentPage)
        ? currentPage
        : Object.keys(pages).find(pagePattern =>
            matchPageOrDomain(pagePattern, currentPage)
          );
      const candidate = pageKey === undefined ? undefined : pages[pageKey];
      if (candidate !== null && typeof candidate === 'object') {
        pageData = candidate;
      }
    }

    const merged = Object.assign({}, domainData, pageData);
    // `pages` is routing configuration, not content: leaving it in would
    // render the rules of every page and defeat the "no rules" check.
    delete merged.pages;
    return merged;
  }

  /**
   * Fetches a JSON rule file and passes the rules that apply to the current
   * location to `callback`. On any failure the error is logged and
   * `callback` receives `{}`. `callback` is invoked exactly once.
   */
  function loadExternalRules(externalRulesURL, currentDomain, currentPage, callback) {
    fetch(externalRulesURL)
      .then(response => {
        if (!response.ok) {
          throw new Error(`Request failed with status ${response.status}`);
        }
        return response.json();
      })
      .then(data => {
        const rulesObj = (data && data.rules) || {};
        return scrapeDataByRules(rulesObj, currentDomain, currentPage);
      })
      .catch(err => {
        console.error('Error loading external rules:', err);
        return {};
      })
      .then(callback)
      .catch(err => {
        // Failures while rendering are reported separately so they are not
        // mistaken for a loading problem and do not trigger a second render.
        console.error('Error applying scraped rules:', err);
      });
  }

  /** Resolves the rules for the current location and renders them. */
  function runScraping() {
    const currentDomain = window.location.hostname;
    const currentPage = window.location.pathname;

    if (typeof rules === 'string') {
      // Load external rules if the rules parameter is a URL string
      loadExternalRules(rules, currentDomain, currentPage, handleRules);
    } else {
      handleRules(scrapeDataByRules(rules, currentDomain, currentPage));
    }
  }

  /**
   * Renders `pageRules` into a fragment and swaps it in for the current
   * body content. Leaves the page untouched (and warns) when there are no
   * rules.
   */
  function handleRules(pageRules) {
    if (Object.keys(pageRules).length === 0) {
      console.warn('No rules found for the current domain and page.');
      return;
    }

    const fragment = document.createDocumentFragment();
    createDOMFromScrapedData(pageRules, fragment, options.keepChildren);
    document.body.innerHTML = '';
    document.body.appendChild(fragment);
  }

  runScraping();
}