利用puppeteer采集自己想要的书籍

发布时间:2021-03-28 编辑:RainNight 阅读(1608)

    安装node


    # The scripts below use async/await and Puppeteer, neither of which runs on
    # Node v0.10.x, so build a modern LTS release from source instead.
    cd /usr/local/src/
    wget https://nodejs.org/dist/v14.16.0/node-v14.16.0.tar.gz
    tar zxvf node-v14.16.0.tar.gz
    cd node-v14.16.0
    ./configure --prefix=/usr/local/node/14.16.0
    make
    make install
    # The custom prefix is not on PATH by default; add it before checking the version.
    export PATH=/usr/local/node/14.16.0/bin:$PATH
    node -v
    

    file

    安装完node之后npm就安装好了

    vagrant@homestead:~/code/Ecc3.0_System01$ npm -v
    

    file

    切换cnpm国内源


    # registry.npm.taobao.org has been retired; registry.npmmirror.com is its successor.
    $ npm install -g cnpm --registry=https://registry.npmmirror.com
    

    安装puppeteer


    $ cnpm i puppeteer
    

    测试脚本


    const puppeteer = require('puppeteer');  
    
    /**
     * Smoke test: open Baidu, capture a screenshot and a PDF, search for
     * "Python", then capture the result page the same way.
     * Output files are written to ./output/ relative to the working directory.
     */
    (async () => {
        // page.waitFor() is deprecated and no longer available in current
        // Puppeteer releases, so pause with a plain timer that works everywhere.
        const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

        let browser;
        try {
            browser = await puppeteer.launch();
            const page = await browser.newPage();
            await page.goto('https://www.baidu.com/');

            // Wait before capturing the page.
            await sleep(1000 * 2);

            // Screenshot
            await page.screenshot({path:'./output/baidu.png',fullPage:true});

            // PDF
            await page.pdf({path:'./output/baidu.pdf',format:'A4',printBackground:true});

            // Type the query; `delay` is the pause between key presses in milliseconds.
            await page.type('#kw','Python',{delay:100});
            // Submit the search.
            await page.click('#su');

            // Wait before capturing the result page.
            await sleep(1000 * 5);

            await page.setViewport({
                width:1920,
                height:1080
            });

            // Screenshot
            await page.screenshot({path:'./output/baidu_python.png',fullPage:true});

            // PDF
            await page.pdf({path:'./output/baidu_python.pdf',format:'A4',printBackground:true});
        } catch (error) {
            console.log(`this is the ${error}`);
        } finally {
            // Close the browser even when a step above failed, so no Chromium
            // process is left running.
            if (browser) {
                await browser.close().catch((closeError) => {
                    console.log(`this is the ${closeError}`);
                });
            }
        }
    })();
    

    file

    结果


    file

    批量下载 ES6 文档


    const puppeteer = require("puppeteer");
    
    /**
     * Crawl the chapter list of the ES6 tutorial and save each chapter as a PDF
     * in the `output` directory next to this script.
     */
    (async () => {
        let browser;
        try {
            // Required inside the try so a missing module is reported like any
            // other failure of this script.
            const fs = require('fs');
            const path = require('path');

            // Write next to this script rather than to a machine-specific
            // absolute path, and make sure the directory exists.
            const outputDir = path.join(__dirname, 'output');
            fs.mkdirSync(outputDir, { recursive: true });

            browser = await puppeteer.launch({
                headless: true,
                // Chromium switches start with two ASCII hyphens. The en dash
                // (U+2013) previously used here is not a switch prefix, so none
                // of these options were being applied.
                // NOTE(review): these switches were never active before this fix;
                // re-test `--single-process` / `--no-zygote` on the target machine.
                args: [
                    '--disable-gpu', // GPU hardware acceleration
                    '--disable-dev-shm-usage', // shared memory used for temporary files
                    '--disable-setuid-sandbox', // setuid sandbox
                    '--no-first-run', // no first-run page; start on a blank page
                    '--no-sandbox', // sandbox mode
                    '--no-zygote',
                    '--single-process' // run in a single process
                ]
            });
            const page = await browser.newPage();

            await page.goto('http://es6.ruanyifeng.com/#README', {
                timeout: 0 // no navigation timeout
            });

            // Runs in the page: collect the target and label of every link
            // inside an ordered list.
            const aTags = await page.evaluate(() =>
                Array.from(document.querySelectorAll('ol li a'), (a) => ({
                    href: a.href.trim(),
                    name: a.text
                }))
            );

            // Index 0 is skipped, exactly as before.
            // NOTE(review): presumably the README entry already opened above — confirm.
            for (let i = 1; i < aTags.length; i++) {
                const { href, name } = aTags[i];
                // Link text becomes a file name, so replace path separators and
                // characters that are illegal in file names.
                const safeName = name.trim().replace(/[\\\/:*?"<>|]/g, '_');

                const chapterPage = await browser.newPage();
                try {
                    await chapterPage.goto(href, { timeout: 0 });
                    await chapterPage.pdf({
                        path: path.join(outputDir, `${safeName}.pdf`),
                        format: 'a4'
                    });
                } finally {
                    // Await the close so tabs do not pile up across iterations.
                    await chapterPage.close();
                }
                console.log(`完成个数:${i}`);
            }
            console.log("完成");
        } catch (err) {
            console.log(`this is the ${err}`);
        } finally {
            // Close the browser on success and on failure alike.
            if (browser) {
                await browser.close().catch((closeError) => {
                    console.log(`this is the ${closeError}`);
                });
            }
        }
    })();
    

    执行


    $ node crawl.js
    

    结果


    file

    将PDF合并到一块

    $ sudo apt-get install pdftk
    
    $ cnpm i pdf-merge
    

    脚本

    const PDFMerge = require('pdf-merge');
    const path = require('path');
    const fs = require('fs');
    // const { formatTime } = require('./modules/utils');
    
    /**
     * Build a path rooted at the directory that contains this script.
     *
     * @param {String} dir first path segment, relative to the script directory
     * @param {String} [dir2=''] optional second segment appended after `dir`
     * @return {String} the segments joined and normalized with POSIX separators
     */
    function resolve(dir, dir2 = ''){
        const segments = [__dirname, './', dir, dir2];
        return path.posix.join(...segments);
    }
    
    // Configuration: directory holding the per-chapter PDFs and directory that
    // receives the merged file, both relative to this script.
    const config = {
        entry: './output/',
        output: './data/'
    };

    // Only PDF files can be merged, so ignore anything else in the entry
    // directory (screenshots, hidden files, ...).
    const filenameArr = fs
        .readdirSync(resolve(config.entry))
        .filter((name) => path.extname(name).toLowerCase() === '.pdf');

    // Leading chapter number of a name such as "12.xxx.pdf"; 0 when there is none.
    const CHAPTER_PREFIX = /^(\d{1,2})\./;
    const chapterNumberOf = (name) => {
        const match = CHAPTER_PREFIX.exec(name);
        // A match is an array (full match plus capture group). Coercing the
        // whole array with unary `+` gives NaN, which made the old comparator
        // return NaN for every numbered pair; read the capture group instead.
        return match === null ? 0 : Number(match[1]);
    };

    // Sort numerically by chapter prefix. Sorting a copy leaves filenameArr in
    // directory order.
    const sortedFilenameArr = [...filenameArr].sort(
        (str1, str2) => chapterNumberOf(str1) - chapterNumberOf(str2)
    );

    // Full path of every input file, in merge order.
    const files = sortedFilenameArr.map((el) => resolve(`${config.entry}${el}`));

    console.log('files', files);

    const outputPath = resolve(config.output);

    // Checked once here; the output directory is created later when missing.
    const isExists = fs.existsSync(outputPath);

    console.log('isExists', isExists, 'outputPath', outputPath);
    
    /**
     * Create the output directory named by `outputPath`.
     * A failure is logged, not thrown, so the caller continues either way.
     */
    function mkdirOutputpath(){
        try {
            fs.mkdirSync(outputPath);
        } catch (mkdirError) {
            console.log('mkdir is failed!', mkdirError);
            return;
        }
        console.log('mkdir is successful!');
    }
    // Create the output directory when it does not exist yet.
    if(!isExists){
        mkdirOutputpath();
    }

    console.log('let\'s start merge...');

    // Date.now() makes the file name differ from one run to the next.
    const filename = `ES6 入门教程-${Date.now()}.pdf`;

    const output = resolve(`${config.output}${filename}`);

    // Merge every input file and save the result as a new file at `output`.
    // The rejection handler keeps a failed merge from surfacing as an
    // unhandled promise rejection and marks the run as failed.
    PDFMerge(files, { output })
        .then(() => {
            console.log('merge is successful!');
        })
        .catch((err) => {
            console.log('merge is failed!', err);
            process.exitCode = 1;
        });
    

    结果

    file

关键字Puppeteer

Collect from 雨夜的博客 雨夜的博客