package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
)

// HttpGet fetches url and returns the response body as a string.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Read the body in 2 KB chunks and append each chunk to result.
	buf := make([]byte, 2048)
	for {
		n, err2 := resp.Body.Read(buf)
		if n > 0 {
			result += string(buf[:n])
		}
		if err2 == io.EOF {
			break
		}
		if err2 != nil {
			return "", err2
		}
	}
	return
}
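
// httpGetReadAll is a minimal alternative sketch of HttpGet: it drains the
// response body with io.ReadAll instead of a manual read loop. The function
// name is illustrative and was not part of the original program.
func httpGetReadAll(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(data), nil
}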

// GetPage downloads a single detail page and extracts the joke text.
func GetPage(url string) (content string, err error) {
	html, err := HttpGet(url)
	if err != nil {
		return "", err
	}

	// NOTE: the HTML tags inside the original pattern were lost when the
	// source was formatted; the wrapper element below is a reconstruction
	// of the content container on the detail page.
	ret := regexp.MustCompile(`<p class="fonts">(?s:(.*?))</p>`)
	res := ret.FindAllStringSubmatch(html, 1)
	for _, k := range res {
		content = k[1]
	}
	return
}

// SaveFile writes the jokes collected for page i to a text file, one entry
// per separator block.
func SaveFile(i int, con []string) {
	path := "D:\\www\\gostudy\\pachong\\html\\dz-" + strconv.Itoa(i) + ".txt"
	fp, err := os.Create(path)
	if err != nil {
		fmt.Println("failed to create file:", err)
		return
	}
	defer fp.Close()

	for _, k := range con {
		if len(k) > 0 {
			fp.WriteString(k + "\r\n")
			fp.WriteString("---------------------------\r\n")
		}
	}
}

// SpiderWeb crawls one list page, fetches every joke linked from it, saves
// the results, and signals completion on c.
func SpiderWeb(i int, c chan int) {
	url := "https://www.xiaohua.com/duanzi?page=" + strconv.Itoa(i)
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("Error:", err)
		c <- i // still signal completion so Working does not block forever
		return
	}

	// Extract the detail-page IDs with a regular expression.
	ret := regexp.MustCompile(`data-initialized="true" dataid="(\d+)">`)
	links := ret.FindAllStringSubmatch(result, -1)

	conlist := make([]string, 0)
	for _, link := range links {
		linkurl := "https://www.xiaohua.com/detail/" + link[1]
		//fmt.Println(linkurl)

		// Fetch the content of a single detail page.
		con, err2 := GetPage(linkurl)
		if err2 != nil {
			fmt.Println("Spider GetPage Error:", err2)
			continue
		}
		conlist = append(conlist, con)
	}

	// Save the results to a file.
	SaveFile(i, conlist)
	// Report this page as finished over the channel.
	c <- i
}

// Working starts one SpiderWeb goroutine per page and then waits for all of
// them by receiving one completion signal per page.
func Working(start, end int) {
	c := make(chan int)
	for i := start; i <= end; i++ {
		go SpiderWeb(i, c)
	}
	for i := start; i <= end; i++ {
		fmt.Printf("page %d crawled\n", <-c)
	}
}

func main() {
	var start, end int
	fmt.Print("Enter start page: ")
	fmt.Scan(&start)
	fmt.Print("Enter end page: ")
	fmt.Scan(&end)
	Working(start, end)
}
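
// workingWaitGroup is an alternative sketch of Working that waits with a
// sync.WaitGroup and closes the channel when every crawler is done, instead
// of counting completion signals. The helper name is illustrative and was
// not part of the original program.
func workingWaitGroup(start, end int) {
	var wg sync.WaitGroup
	c := make(chan int)

	for i := start; i <= end; i++ {
		wg.Add(1)
		go func(page int) {
			defer wg.Done()
			SpiderWeb(page, c)
		}(i)
	}

	// Close c once every SpiderWeb goroutine has finished, so the range
	// loop below terminates.
	go func() {
		wg.Wait()
		close(c)
	}()

	for page := range c {
		fmt.Printf("page %d crawled\n", page)
	}
}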