flyscrape

<picture> <source media="(prefers-color-scheme: dark)" srcset=".github/assets/logo-alt.png"> <source media="(prefers-color-scheme: light)" srcset=".github/assets/logo.png"> <img width="200" alt="flyscrape" src="https://yellow-cdn.veclightyear.com/2b54e442/2a9e1264-3c4b-43aa-8387-32f3f85c15bd.png"> </picture> Flyscrape 是一款为不具备高级编程技能的用户设计的命令行网页抓取工具，可以精确提取网站数据。 <a href="#安装">安装</a> · <a href="https://flyscrape.com/docs/getting-started">文档</a> · <a href="https://github.com/philippta/flyscrape/releases">发布版本</a>

演示

特性

独立运行: Flyscrape 以单个可执行二进制文件的形式提供。
类似 jQuery: 使用熟悉的 API 从 HTML 页面提取数据。
可脚本化: 使用 JavaScript 编写数据提取逻辑。
系统 Cookie: 允许 Flyscrape 访问您的浏览器 cookie 存储。
浏览器模式: 使用无头浏览器渲染 JavaScript 密集型页面。

示例

这个示例抓取 Hacker News 的前几页，特别是 New、Show 和 Ask 部分。

export const config = {
    // Start from the New, Show and Ask listings of Hacker News.
    urls: ["new", "show", "ask"].map(
        (section) => `https://news.ycombinator.com/${section}`,
    ),

    // Cache requests in a file so later runs can reuse them.
    cache: "file",

    // Render pages with a browser so JavaScript runs (non-headless mode).
    browser: true,
    headless: false,

    // Follow the pagination link up to 5 times.
    depth: 5,
    follow: ["a.morelink[href]"],
};

/**
 * Extracts the page title and the listed posts from a Hacker News page.
 *
 * @param {object} context - Scrape context supplied by flyscrape.
 * @param {object} context.doc - Parsed HTML document of the current page.
 * @param {function(string): string} context.absoluteURL - Converts a relative
 *     URL into an absolute one.
 * @returns {{title: string, posts: Array<{title: string, url: string}>}}
 *     The page title and one entry per `.athing` row.
 */
export default function scrape({ doc, absoluteURL }) {
    const title = doc.find("title");
    const posts = doc.find(".athing");

    return {
        title: title.text(),
        posts: posts.map((post) => {
            const link = post.find(".titleline > a");

            return {
                title: link.text(),
                // Post links may be relative to the listing page, so resolve
                // them to get a URL that is usable outside the page.
                url: absoluteURL(link.attr("href")),
            };
        }),
    };
}

$ flyscrape run hackernews.js
[
  {
    "url": "https://news.ycombinator.com/new",
    "data": {
      "title": "New Links | Hacker News",
      "posts": [
        {
          "title": "Show HN: flyscrape - A standalone and scriptable web scraper",
          "url": "https://flyscrape.com/"
        },
        ...
      ]
    }
  }
]

查看 examples 文件夹获取更详细的示例。

安装

Homebrew

对于 macOS 用户，flyscrape 可以通过 Homebrew 安装：

brew install flyscrape

预编译二进制文件

flyscrape 为 macOS、Linux 和 Windows 提供可下载的二进制文件，可从发布页面获取。

从源代码编译

要从源代码编译 flyscrape，请按以下步骤操作：

安装 Go：确保您的系统已安装 Go。如果没有，可以从 https://go.dev/ 下载。

安装 flyscrape：打开终端并运行以下命令：

go install github.com/philippta/flyscrape/cmd/flyscrape@latest

使用方法

用法：

    flyscrape run 脚本 [配置标志]

示例：

    # 运行脚本。
    $ flyscrape run example.js

    # 将 URL 作为参数设置。
    $ flyscrape run example.js --url "http://other.com"

    # 启用代理支持。
    $ flyscrape run example.js --proxies "http://someproxy:8043"

    # 跟随分页链接。
    $ flyscrape run example.js --depth 5 --follow ".next-button > a"

    # 将输出格式设置为 ndjson。
    $ flyscrape run example.js --output.format ndjson

    # 将输出写入文件。
    $ flyscrape run example.js --output.file results.json

配置

以下是一个展示 flyscrape 功能的抓取脚本示例。有关所有配置选项的完整文档，请访问文档页面。

export const config = {
    // Specify the URL to start scraping from.
    url: "https://example.com/",

    // Specify multiple URLs to start scraping from.           (default = [])
    urls: [
        "https://anothersite.com/",
        "https://yetanother.com/",
    ],

    // Enable rendering with a headless browser.               (default = false)
    browser: true,

    // Specify whether the browser should be headless.         (default = true)
    headless: false,

    // Specify how deep links should be followed.              (default = 0, no follow)
    depth: 5,

    // Specify the CSS selectors to follow.                    (default = ["a[href]"])
    follow: [".next > a", ".related a"],

    // Specify the allowed domains. ['*'] for all.             (default = domain from url)
    allowedDomains: ["example.com", "anothersite.com"],

    // Specify the blocked domains.                            (default = none)
    blockedDomains: ["somesite.com"],

    // Specify the allowed URLs as regular expressions.        (default = all allowed)
    // The backslash is doubled so the string keeps `\d`; a single `\d` inside
    // a string literal collapses to a plain `d`.
    allowedURLs: ["/posts", "/articles/\\d+"],

    // Specify the blocked URLs as regular expressions.        (default = none)
    blockedURLs: ["/admin"],

    // Specify the rate in requests per minute.                (default = no rate limit)
    rate: 60,

    // Specify the number of concurrent requests.              (default = no limit)
    concurrency: 1,

    // Specify a single HTTP(S) proxy URL.                     (default = no proxy)
    // Note: not compatible with browser mode.
    proxy: "http://someproxy.com:8043",

    // Specify multiple HTTP(S) proxy URLs.                    (default = no proxy)
    // Note: not compatible with browser mode.
    proxies: [
      "http://someproxy.com:8043",
      "http://someotherproxy.com:8043",
    ],

    // Enable file-based request caching.                      (default = no cache)
    cache: "file",

    // Specify the HTTP request headers.                       (default = none)
    headers: {
        "Authorization": "Bearer ...",
        "User-Agent": "Mozilla ...",
    },

    // Use the cookie store of your local browser.             (default = off)
    // Options: "chrome" | "edge" | "firefox"
    cookies: "chrome",

    // Specify the output options.
    output: {
        // Specify the output file.                            (default = stdout)
        file: "results.json",

        // Specify the output format.                          (default = json)
        // Options: "json" | "ndjson"
        format: "json",
    },
};

export default function ({ doc, url, absoluteURL }) {
    // doc              - Contains the parsed HTML document.
    // url              - Contains the scraped URL.
    // absoluteURL(...) - Transforms a relative URL into an absolute URL.
}

查询 API

// <div class="element" foo="bar">Hey</div>
const el = doc.find(".element")
el.text()                                 // "Hey"
el.html()                                 // `<div class="element" foo="bar">Hey</div>`
el.attr("foo")                            // "bar"
el.hasAttr("foo")                         // true
el.hasClass("element")                    // true

// <ul>
//   <li class="a">Item 1</li>
//   <li>Item 2</li>
//   <li>Item 3</li>
// </ul>
const list = doc.find("ul")
list.children()                           // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>]

const items = list.find("li")
items.length()                            // 3
items.first()                             // <li class="a">Item 1</li>
items.last()                              // <li>Item 3</li>
items.get(1)                              // <li>Item 2</li>
items.get(1).prev()                       // <li class="a">Item 1</li>
items.get(1).next()                       // <li>Item 3</li>
items.get(1).parent()                     // <ul>...</ul>
items.get(1).siblings()                   // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>]
items.map(item => item.text())            // ["Item 1", "Item 2", "Item 3"]
items.filter(item => item.hasClass("a"))  // [<li class="a">Item 1</li>]

Flyscrape API

文档解析

import { parse } from "flyscrape";

// Parse an HTML string into a document that can be queried with find().
const doc = parse(`<div class="foo">bar</div>`);
// Read the text content of the element matching ".foo".
const text = doc.find(".foo").text();

文件下载

import { download } from "flyscrape/http";

download("http://example.com/image.jpg")              // downloads as "image.jpg"
download("http://example.com/image.jpg", "other.jpg") // downloads as "other.jpg"
download("http://example.com/image.jpg", "dir/")      // downloads as "dir/image.jpg"

// If the server offers a filename via the Content-Disposition header and no
// destination filename is provided, Flyscrape will honor the suggested filename.
// E.g. `Content-Disposition: attachment; filename="archive.zip"`
download("http://example.com/generate_archive.php", "dir/") // downloads as "dir/archive.zip"