# Go爬虫
# Flag
- https://github.com/go-rod/rod
- https://github.com/derekparker/delve
- https://github.com/mafredri/cdp
- https://github.com/projectdiscovery/katana
- 类BeautifulSoup https://github.com/anaskhan96/soup
- 示例 https://github.com/bestnite/bilinovel-downloader
 
# HTML解析
- https://github.com/antchfx htmlquery使用xpath选择器
- https://github.com/PuerkitoBio/goquery goquery使用css选择器
- https://github.com/gocolly/colly
- https://github.com/go-predator/predator
 
# chromedp
该组织下有以下项目
https://github.com/chromedp/chromedp
https://godoc.org/github.com/chromedp/chromedp
https://github.com/chromedp/cdproto-gen
生成用于Chrome DevTools协议的命令,事件和类型的Go代码
https://github.com/chromedp/cdproto
包含针对Chrome DevTools协议域生成的命令,类型和事件
https://godoc.org/github.com/chromedp/cdproto
cdproto下有以下包
github.com/chromedp/cdproto/cdp
cdp.TimeSinceEpoch(time.Now().Add(180 * 24 * time.Hour))
github.com/chromedp/cdproto/network 网络处理
比如启用网络跟踪,现在将网络事件传递给客户端
network.Enable()
github.com/chromedp/cdproto/page chrome页面处理
比如设置下载
page.SetDownloadBehavior(page.SetDownloadBehaviorBehaviorDeny)
github.com/chromedp/cdproto/target 操作标签页 Target.createTarget
// 根据url新建浏览器标签页
tag := target.CreateTarget(url)
// 新建浏览器标签页ID创建上下文,方式一
ctx, cancel = chromedp.NewContext(ctx, chromedp.WithTargetID(target.ID(tag.BrowserContextID)))
// 根据上下文,返回targetID-打开的页面的ID。
tagId, err := tag.Do(ctx)
// 新建浏览器标签页ID创建上下文,方式二
ctx, cancel = chromedp.NewContext(ctx, chromedp.WithTargetID(tagId))
chromedp.Targets(ctx)获取浏览器实例的标签页信息列表
chromedp能做什么
反爬虫js,例如有的网页后台js自动发送心跳包,浏览器里会自动运行,不需要我们手动处理
针对于前端页面的自动化测试
解决类似VueJS和SPA之类的渲染
解决网页的懒加载
网页截图和pdf导出,而不需要额外的去学习其他的库实现
seo训练和刷点击量
执行javascript 代码
设置dom的标签属性
chromedp同Python的selenium,它是使用Chrome Debugging Protocol(简称cdp) 并且没有外部依赖 (如Selenium,PhantomJS等)
| 函数或变量 | 说明 | 
|---|---|
| (a *ExecAllocator) Allocate(ctx context.Context, opts ...BrowserOption) (*Browser, error) | |
| (a *ExecAllocator) Wait() | |
| (a *RemoteAllocator) Allocate(ctx context.Context, opts ...BrowserOption) (*Browser, error) | |
| (a *RemoteAllocator) Wait() | |
| (f ActionFunc) Do(ctx context.Context) error | |
| (s *Selector) Do(ctx context.Context) error | |
| (t Tasks) Do(ctx context.Context) error | |
| After(f func(context.Context, ...*cdp.Node) error) QueryOption | |
| AtLeast(n int) QueryOption | |
| Attributes(sel interface{}, attributes *map[string]string, opts ...QueryOption) QueryAction | |
| AttributesAll(sel interface{}, attributes *[]map[string]string, opts ...QueryOption) QueryAction | |
| AttributeValue(sel interface{}, name string, value *string, ok *bool, opts ...QueryOption) QueryAction | |
| Blur(sel interface{}, opts ...QueryOption) QueryAction | |
| ByFunc(f func(context.Context, *cdp.Node) ([]cdp.NodeID, error)) QueryOption | |
| ByID(s *Selector) | 只按id来选择元素 | 
| ByJSPath(s *Selector) | |
| ByNodeID(s *Selector) | 按节点ID选择元素 | 
| ByQuery(s *Selector) | 根据document.querySelector的规则选择元素,返回单个节点 | 
| ByQueryAll(s *Selector) | 根据document.querySelectorAll返回所有匹配的节点 | 
| BySearch(s *Selector) | 如果不写,默认会使用这个选择器,类似devtools ctrl+f 搜索 | 
| Cancel(ctx context.Context) error | |
| CaptureScreenshot(res *[]byte) Action | |
| Clear(sel interface{}, opts ...QueryOption) QueryAction | |
| Click(sel interface{}, opts ...QueryOption) QueryAction | 触发点击事件 | 
| CombinedOutput(w io.Writer) ExecAllocatorOption | |
| ComputedStyle(sel interface{}, style *[]*css.ComputedProperty, opts ...QueryOption) QueryAction | |
| Dimensions(sel interface{}, model **dom.BoxModel, opts ...QueryOption) QueryAction | |
| DisableGPU(a *ExecAllocator) | 关闭gpu,服务器一般没有显卡 | 
| DoubleClick(sel interface{}, opts ...QueryOption) QueryAction | |
| Env(vars ...string) ExecAllocatorOption | NAME=value形式的通用环境变量列表,用于传递到新的Chrome进程。 | 
| ExecPath(path string) ExecAllocatorOption | |
| Flag(name string, value interface{}) ExecAllocatorOption | |
| Focus(sel interface{}, opts ...QueryOption) QueryAction | |
| FromContext(ctx context.Context) *Context | |
| Headless(a *ExecAllocator) | |
| InnerHTML(sel interface{}, html *string, opts ...QueryOption) QueryAction | |
| JavascriptAttribute(sel interface{}, name string, res interface{}, opts ...QueryOption) QueryAction | |
| ListenBrowser(ctx context.Context, fn func(ev interface{})) | |
| ListenTarget(ctx context.Context, fn func(ev interface{})) | |
| Location(urlstr *string) Action | |
| MatchedStyle(sel interface{}, style **css.GetMatchedStylesForNodeReturns, opts ...QueryOption) QueryAction | |
| Navigate(urlstr string) NavigateAction | 设置url | 
| NavigateBack() NavigateAction | |
| NavigateForward() NavigateAction | |
| NavigateToHistoryEntry(entryID int64) NavigateAction | |
| NavigationEntries(currentIndex *int64, entries *[]*page.NavigationEntry) NavigateAction | |
| NewContext(parent context.Context, opts ...ContextOption) (context.Context, context.CancelFunc) | |
| NewExecAllocator(parent context.Context, opts ...ExecAllocatorOption) (context.Context, context.CancelFunc) | |
| NewRemoteAllocator(parent context.Context, url string) (context.Context, context.CancelFunc) | |
| NodeEnabled(s *Selector) | |
| NoDefaultBrowserCheck(a *ExecAllocator) | 禁用默认浏览器检查。 | 
| NodeIDs(sel interface{}, ids *[]cdp.NodeID, opts ...QueryOption) QueryAction | |
| NodeNotPresent(s *Selector) | |
| NodeNotVisible(s *Selector) | |
| NodeReady(s *Selector) | |
| Nodes(sel interface{}, nodes *[]*cdp.Node, opts ...QueryOption) QueryAction | |
| NodeSelected(s *Selector) | |
| NodeVisible(s *Selector) | |
| NoFirstRun(a *ExecAllocator) | 禁用第一次运行对话框的Chrome命令行选项 | 
| NoSandbox(a *ExecAllocator) | 用于禁用沙箱的Chrome命令行选项 | 
| OuterHTML(sel interface{}, html *string, opts ...QueryOption) QueryAction | 获取HTML源码 | 
| ProxyServer(proxy string) ExecAllocatorOption | |
| Query(sel interface{}, opts ...QueryOption) QueryAction | |
| QueryAfter(sel interface{}, f func(context.Context, ...*cdp.Node) error, opts ...QueryOption) QueryAction | |
| Reload() NavigateAction | |
| RemoveAttribute(sel interface{}, name string, opts ...QueryOption) QueryAction | |
| Reset(sel interface{}, opts ...QueryOption) QueryAction | |
| Run(ctx context.Context, actions ...Action) error | |
| Screenshot(sel interface{}, picbuf *[]byte, opts ...QueryOption) QueryAction | 标签截图,赋值给变量 | 
| ScrollIntoView(sel interface{}, opts ...QueryOption) QueryAction | |
| SendKeys(sel interface{}, v string, opts ...QueryOption) QueryAction | 使用chromedp.ByID选择器。向标签输入内容。 | 
| SetAttributes(sel interface{}, attributes map[string]string, opts ...QueryOption) QueryAction | |
| SetAttributeValue(sel interface{}, name, value string, opts ...QueryOption) QueryAction | |
| SetJavascriptAttribute(sel interface{}, name, value string, opts ...QueryOption) QueryAction | |
| SetUploadFiles(sel interface{}, files []string, opts ...QueryOption) QueryAction | |
| SetValue(sel interface{}, value string, opts ...QueryOption) QueryAction | |
| Sleep(d time.Duration) Action | 等待时间 | 
| Stop() NavigateAction | |
| Submit(sel interface{}, opts ...QueryOption) QueryAction | |
| Targets(ctx context.Context) ([]*target.Info, error) | |
| Text(sel interface{}, text *string, opts ...QueryOption) QueryAction | |
| TextContent(sel interface{}, text *string, opts ...QueryOption) QueryAction | |
| Title(title *string) Action | |
| UserAgent(userAgent string) ExecAllocatorOption | |
| UserDataDir(dir string) ExecAllocatorOption | |
| Value(sel interface{}, value *string, opts ...QueryOption) QueryAction | 取值并赋值给变量 | 
| WaitEnabled(sel interface{}, opts ...QueryOption) QueryAction | |
| WaitFunc(wait func(context.Context, *cdp.Frame, ...cdp.NodeID) ([]*cdp.Node, error)) QueryOption | |
| waitLoaded(ctx context.Context) error | |
| WaitNewTarget(ctx context.Context, fn func(*target.Info) bool) <-chan target.ID | |
| WaitNotPresent(sel interface{}, opts ...QueryOption) QueryAction | |
| WaitNotVisible(sel interface{}, opts ...QueryOption) QueryAction | |
| WaitReady(sel interface{}, opts ...QueryOption) QueryAction | |
| WaitSelected(sel interface{}, opts ...QueryOption) QueryAction | |
| WaitVisible(sel interface{}, opts ...QueryOption) QueryAction | 等待指定标签元素加载完成 | 
| WindowSize(width, height int) ExecAllocatorOption | 设置初始窗口大小的命令行选项。 | 
| WithBrowserOption(opts ...BrowserOption) ContextOption | |
| WithDebugf(f func(string, ...interface{})) ContextOption | |
| WithErrorf(f func(string, ...interface{})) ContextOption | |
| WithLogf(f func(string, ...interface{})) ContextOption | |
| WithTargetID(id target.ID) ContextOption | |
# godet
- 截图
 
package main
import "fmt"
import "time"
import "github.com/raff/godet"
// main is a walkthrough of the godet remote-debugging client against a Chrome
// instance listening on localhost:9222. It lists tabs, installs event
// callbacks, opens and navigates tabs, evaluates JavaScript, saves a
// screenshot and a PDF, prints coverage data, and toggles download behaviour.
//
// Apart from the initial connection, every step is best-effort: a failure is
// printed and the walkthrough continues.
func main() {
	// The download demo at the end originally navigated to an undeclared
	// identifier `url`, which does not compile. This value is a placeholder;
	// point it at a page that actually triggers a download.
	const downloadURL = "https://www.google.com"

	// report prints a failed step without aborting the walkthrough.
	report := func(step string, err error) {
		if err != nil {
			fmt.Println(step+":", err)
		}
	}

	// nested reads params[key][field] without panicking when params[key] is
	// missing or is not a JSON object (indexing a nil map yields nil).
	nested := func(params godet.Params, key, field string) interface{} {
		m, _ := params[key].(map[string]interface{})
		return m[field]
	}

	// connect to Chrome instance
	remote, err := godet.Connect("localhost:9222", false)
	if err != nil {
		fmt.Println("cannot connect to Chrome instance:", err)
		return
	}
	// disconnect when done
	defer remote.Close()

	// get browser and protocol version
	version, err := remote.Version()
	report("version", err)
	fmt.Println(version)

	// get list of open tabs
	tabs, err := remote.TabList("")
	report("tab list", err)
	fmt.Println(tabs)

	// install some callbacks
	remote.CallbackEvent(godet.EventClosed, func(params godet.Params) {
		fmt.Println("RemoteDebugger connection terminated.")
	})
	remote.CallbackEvent("Network.requestWillBeSent", func(params godet.Params) {
		fmt.Println("requestWillBeSent",
			params["type"],
			params["documentURL"],
			nested(params, "request", "url"))
	})
	remote.CallbackEvent("Network.responseReceived", func(params godet.Params) {
		fmt.Println("responseReceived",
			params["type"],
			nested(params, "response", "url"))
	})
	remote.CallbackEvent("Log.entryAdded", func(params godet.Params) {
		entry, _ := params["entry"].(map[string]interface{})
		fmt.Println("LOG", entry["type"], entry["level"], entry["text"])
	})

	// block loading of most images
	report("set blocked URLs", remote.SetBlockedURLs("*.jpg", "*.png", "*.gif"))

	// create new tab
	tab, err := remote.NewTab("https://www.google.com")
	report("new tab", err)
	fmt.Println(tab)

	// enable event processing
	remote.RuntimeEvents(true)
	remote.NetworkEvents(true)
	remote.PageEvents(true)
	remote.DOMEvents(true)
	remote.LogEvents(true)

	// navigate in existing tab; the tab list may be empty (or TabList may
	// have failed), so guard the index instead of panicking on tabs[0]
	if len(tabs) > 0 {
		report("activate tab", remote.ActivateTab(tabs[0]))
	} else {
		fmt.Println("no existing tab to activate")
	}
	//remote.StartPreciseCoverage(true, true)

	// re-enable events when changing active tab
	remote.AllEvents(true) // enable all events
	_, err = remote.Navigate("https://www.google.com")
	report("navigate", err)

	// evaluate Javascript expression in existing context
	res, err := remote.EvaluateWrap(`
            console.log("hello from godet!")
            return 42;
        `)
	report("evaluate", err)
	fmt.Println(res)

	// take a screenshot
	report("screenshot", remote.SaveScreenshot("screenshot.png", 0644, 0, true))
	time.Sleep(time.Second)

	// or save page as PDF
	report("pdf", remote.SavePDF("page.pdf", 0644, godet.PortraitMode(),
		godet.Scale(0.5), godet.Dimensions(6.0, 2.0)))
	// if err := remote.SetInputFiles(0, []string{"hello.txt"}); err != nil {
	//     fmt.Println("setInputFiles", err)
	// }
	time.Sleep(5 * time.Second)

	//remote.StopPreciseCoverage()
	r, err := remote.GetPreciseCoverage(true)
	if err != nil {
		fmt.Println("error profiling", err)
	} else {
		fmt.Println(r)
	}

	// Allow downloads
	report("allow downloads", remote.SetDownloadBehavior(godet.AllowDownload, "/tmp/"))
	_, err = remote.Navigate(downloadURL)
	report("navigate", err)
	time.Sleep(time.Second)

	// Block downloads
	report("deny downloads", remote.SetDownloadBehavior(godet.DenyDownload, ""))
	_, err = remote.Navigate(downloadURL)
	report("navigate", err)
}
# gcd
- 截图
 
package main
import (
	"encoding/base64"
	"flag"
	"fmt"
	"log"
	"net/url"
	"os"
	"runtime"
	"sync"
	"time"
	"github.com/wirepair/gcd"
	"github.com/wirepair/gcd/gcdapi"
)
// numTabs is the number of browser tabs main opens; each tab loads one entry
// of main's URL list.
const numTabs = 5

// Package-level state shared between init, main and the helper functions.
var (
	// debugger is the Chrome debugger handle created in main.
	debugger *gcd.Gcd
	// wg counts tabs whose page-load event has not fired yet.
	wg sync.WaitGroup
	// path is bound to the -chrome flag in init.
	path string
	// dir is bound to the -dir flag in init.
	dir string
	// port is bound to the -port flag in init.
	port string
)
// init registers the -chrome, -dir and -port command-line flags, choosing
// per-OS defaults for the Chrome binary and the user directory.
//
// The flags are registered on every platform. Previously an OS other than
// windows/darwin/linux left -chrome and -dir undefined, so passing them made
// flag.Parse reject the command line. On such platforms the Chrome path now
// defaults to empty and must be supplied with -chrome.
func init() {
	chromePath, userDir := "", "/tmp/"
	switch runtime.GOOS {
	case "windows":
		chromePath = "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"
		userDir = "C:\\temp\\"
	case "darwin":
		chromePath = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
	case "linux":
		chromePath = "/usr/bin/chromium-browser"
	}
	flag.StringVar(&path, "chrome", chromePath, "path to chrome")
	flag.StringVar(&dir, "dir", userDir, "user directory")
	flag.StringVar(&port, "port", "9222", "Debugger port")
}
// main starts Chrome through the gcd debugger, opens numTabs tabs, navigates
// each one to an entry of urls, waits until every tab has fired
// Page.loadEventFired, and then screenshots each tab in turn.
func main() {
	urls := []string{"http://www.google.com", "http://www.veracode.com",
		"http://www.microsoft.com", "http://bbc.co.uk",
		"http://www.reddit.com/r/golang"}
	flag.Parse()

	debugger = gcd.NewChromeDebugger()
	debugger.StartProcess(path, dir, port)
	defer debugger.ExitProcess()

	// fatalf shuts Chrome down before exiting. log.Fatalf ends the program via
	// os.Exit, which skips deferred calls, so the deferred ExitProcess above
	// would never run and the Chrome child process would be left behind.
	fatalf := func(format string, args ...interface{}) {
		debugger.ExitProcess()
		log.Fatalf(format, args...)
	}

	targets := make([]*gcd.ChromeTarget, numTabs)
	for i := 0; i < numTabs; i++ {
		target, err := debugger.NewTab()
		if err != nil {
			fatalf("error getting targets: %s\n", err)
		}
		targets[i] = target

		// One pending load per tab; pageLoaded calls wg.Done when it fires.
		// Add must happen before subscribing so Done can never run first.
		wg.Add(1)
		page := target.Page
		page.Enable()
		target.Subscribe("Page.loadEventFired", pageLoaded)

		// navigate
		navigateParams := &gcdapi.PageNavigateParams{Url: urls[i]}
		if _, _, _, err := page.NavigateWithParams(navigateParams); err != nil {
			fatalf("error: %s\n", err)
		}
	}

	// Block until every tab has reported its load event.
	wg.Wait()
	for i := 0; i < numTabs; i++ {
		takeScreenShot(targets[i])
	}
}
// pageLoaded is the Page.loadEventFired callback registered in main. It
// removes the subscription from the target and then marks one pending page
// load as finished on the shared wait group. The event payload is not used.
func pageLoaded(target *gcd.ChromeTarget, event []byte) {
	// Deferred so the wait group is released as the last step, after the
	// subscription has been removed.
	defer wg.Done()
	target.Unsubscribe("Page.loadEventFired")
}
// takeScreenShot activates the given tab, captures it as a PNG and writes the
// image to "<host>.png" in the current directory, where <host> is the host
// part of the tab's document URL. On success the tab is closed afterwards.
//
// Failures are printed to stdout and abort the screenshot for this tab only;
// nothing is returned to the caller.
func takeScreenShot(target *gcd.ChromeTarget) {
	dom := target.DOM
	page := target.Page

	doc, err := dom.GetDocument(-1, true)
	if err != nil {
		fmt.Printf("error getting doc: %s\n", err)
		return
	}

	debugger.ActivateTab(target)
	time.Sleep(1 * time.Second) // give it a sec to paint

	u, urlErr := url.Parse(doc.DocumentURL)
	if urlErr != nil {
		fmt.Printf("error parsing url: %s\n", urlErr)
		return
	}
	fmt.Printf("Taking screen shot of: %s\n", u.Host)

	screenShotParams := &gcdapi.PageCaptureScreenshotParams{
		Format:      "png",
		FromSurface: true,
	}
	img, errCap := page.CaptureScreenshotWithParams(screenShotParams)
	if errCap != nil {
		// The original reused the "error getting doc" text here, which
		// misreported where the failure happened.
		fmt.Printf("error capturing screenshot: %s\n", errCap)
		return
	}

	// The capture is returned base64-encoded; decode to raw image bytes.
	imgBytes, errDecode := base64.StdEncoding.DecodeString(img)
	if errDecode != nil {
		fmt.Printf("error decoding image: %s\n", errDecode)
		return
	}

	// Check the Create error before touching f: on failure f is nil, so the
	// close must not be scheduled ahead of this check.
	f, errFile := os.Create(u.Host + ".png")
	if errFile != nil {
		fmt.Printf("error creating image file: %s\n", errFile)
		return
	}
	// Write and Close can both fail (e.g. disk full); report it rather than
	// silently leaving a truncated file. As before, the tab is still closed.
	if _, errWrite := f.Write(imgBytes); errWrite != nil {
		fmt.Printf("error writing image file: %s\n", errWrite)
	}
	if errClose := f.Close(); errClose != nil {
		fmt.Printf("error closing image file: %s\n", errClose)
	}

	debugger.CloseTab(target)
}