Go example: concurrently crawling a joke list and its detail pages
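The program below reads a start and end page from the user, then launches one goroutine per list page of https://www.xiaohua.com/duanzi. Each goroutine pulls the detail-page IDs out of its list page, fetches every detail page, writes the jokes to a local text file, and reports completion over a channel so main can wait for all pages to finish.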
package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
)

// HttpGet fetches url and returns the whole response body as a string.
func HttpGet(url string) (result string, err error) {
	resp, err1 := http.Get(url)
	if err1 != nil {
		err = err1
		return // resp is nil here; closing it would panic
	}
	defer resp.Body.Close()

	buf := make([]byte, 2048)
	for {
		n, err2 := resp.Body.Read(buf)
		if n == 0 {
			break
		}
		if err2 != nil && err2 != io.EOF {
			err = err2
			return
		}
		result += string(buf[:n])
	}
	return
}

// GetPage extracts the joke text from one detail page.
func GetPage(url string) (content string, err error) {
	html, err1 := HttpGet(url)
	if err1 != nil {
		err = err1
		return
	}
	// NOTE: the HTML tags inside the original pattern were lost when this
	// article was captured; this selector is an assumption — adjust it to
	// the page's real markup. (?s:...) lets "." match newlines.
	ret := regexp.MustCompile(`<p class="fonts">(?s:(.*?))</p>`)
	res := ret.FindAllStringSubmatch(html, 1)
	for _, k := range res {
		content = k[1]
	}
	return
}

// SaveFile writes the jokes collected from list page i to a text file.
func SaveFile(i int, con []string) {
	path := "D:\\www\\gostudy\\pachong\\html\\dz-" + strconv.Itoa(i) + ".txt"
	fp, err := os.Create(path)
	if err != nil {
		fmt.Println("failed to create file")
		return
	}
	defer fp.Close()

	for _, k := range con {
		if len(k) > 0 {
			fp.WriteString(k + "\r\n")
			fp.WriteString("---------------------------" + "\r\n")
		}
	}
}

// SpiderWeb crawls list page i, follows each detail link on it, saves
// the results, and reports completion on channel c.
func SpiderWeb(i int, c chan int) {
	url := "https://www.xiaohua.com/duanzi?page=" + strconv.Itoa(i)
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("Error:", err)
		c <- i // still report, otherwise Working blocks forever
		return
	}

	// Extract the detail-page IDs from the list page.
	ret := regexp.MustCompile(`data-initialized="true" dataid="(\d+)">`)
	links := ret.FindAllStringSubmatch(result, -1)

	conlist := make([]string, 0)
	for _, link := range links {
		linkurl := "https://www.xiaohua.com/detail/" + link[1]
		//fmt.Println(linkurl)
		// Fetch the content of a single detail page.
		con, err2 := GetPage(linkurl)
		if err2 != nil {
			fmt.Println("Spider GetPage Error:", err2)
		}
		conlist = append(conlist, con)
	}

	// Save the collected jokes to a file.
	SaveFile(i, conlist)
	// Signal completion over the channel.
	c <- i
}

// Working launches one goroutine per page, then waits for all of them.
func Working(start, end int) {
	c := make(chan int)
	for i := start; i <= end; i++ {
		go SpiderWeb(i, c)
	}
	for i := start; i <= end; i++ {
		fmt.Printf("page %d done\n", <-c)
	}
}

func main() {
	var start, end int
	fmt.Print("Enter start page: ")
	fmt.Scan(&start)
	fmt.Print("Enter end page: ")
	fmt.Scan(&end)
	Working(start, end)
}
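A note on GetPage: the HTML tags inside the original content pattern were lost when this article was captured, so the `<p class="fonts">` selector used above is an assumption — check the live markup and substitute the real wrapper. Whatever selector you end up with, remember that in Go's regexp "." does not match newlines by default; the (?s:...) flag turns that on, which matters for joke text that spans several lines. A minimal, self-contained demonstration:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	html := "<p class=\"fonts\">line one\nline two</p>"
	// Without (?s:...), "." stops at "\n" and the multi-line body
	// would not be captured; with it, the whole body matches.
	re := regexp.MustCompile(`<p class="fonts">(?s:(.*?))</p>`)
	if m := re.FindStringSubmatch(html); m != nil {
		fmt.Println(m[1]) // prints both lines
	}
}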
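Working is a plain fan-out/fan-in: it fires one goroutine per page, then performs exactly end-start+1 receives on the unbuffered channel, so main cannot exit before every page has reported in (this is also why SpiderWeb must send on c even when its fetch fails). One goroutine per page is fine for a handful of pages but can flood the site for a large range. Below is a sketch of a bounded variant, meant to sit in the same file as SpiderWeb; the buffered-channel semaphore is a standard Go pattern, but the function name and the limit parameter are illustrative assumptions, not part of the original code:

// boundedWorking behaves like Working but allows at most limit
// SpiderWeb calls to run concurrently: the buffered channel sem
// acts as a counting semaphore.
func boundedWorking(start, end, limit int) {
	c := make(chan int)
	sem := make(chan struct{}, limit)
	for i := start; i <= end; i++ {
		go func(page int) {
			sem <- struct{}{}        // acquire a slot (blocks when full)
			defer func() { <-sem }() // release the slot when done
			SpiderWeb(page, c)
		}(i)
	}
	for i := start; i <= end; i++ {
		fmt.Printf("page %d done\n", <-c)
	}
}

Calling boundedWorking(start, end, 5) from main would keep at most five pages in flight at any moment while preserving the same completion reporting.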