package main
import (
"fmt"
"io"
"net/http"
"os"
"regexp"
"strconv"
)
// HttpGet fetches the page at url and returns the whole response body
// as a string. A non-nil error means the request itself failed or the
// body could not be read; result is "" in that case.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		// Must return before touching resp: on error resp is nil and
		// deferring resp.Body.Close() would panic.
		return "", err
	}
	defer resp.Body.Close()

	// 接收数据: read the full body in one shot instead of a manual
	// 2 KiB loop with quadratic string concatenation.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(body), nil
}
// DownImg downloads one image. url is protocol-relative (e.g. "//host/x.jpg");
// the file is saved as "<i>.<ext>" using the URL's last three characters as
// the extension. The image index i is always sent on page when the goroutine
// finishes, success or failure, so the collector in SpiderWeb never blocks.
func DownImg(i int, url string, page chan int) {
	// Signal completion unconditionally; an early error return must not
	// deadlock the receiver counting len(imglist) messages.
	defer func() { page <- i }()

	endstr := url[len(url)-3:]
	path := "D:\\www\\gostudy\\pachong\\img\\" + strconv.Itoa(i) + "." + endstr

	imgresp, err := http.Get("https:" + url)
	if err != nil {
		fmt.Println("图片文件下载失败")
		return
	}
	// Original leaked the response body; close it so the transport can
	// reuse the connection.
	defer imgresp.Body.Close()

	fp, err := os.Create(path)
	if err != nil {
		fmt.Println("文件创建失败", err)
		return
	}
	defer fp.Close()

	// 写入文件: stream straight to disk instead of a manual buffer loop.
	if _, err := io.Copy(fp, imgresp.Body); err != nil {
		fmt.Println("图片文件读取失败")
	}
}
// SpiderWeb fetches the listing page at url, extracts every image URL
// from data-original attributes, downloads them concurrently via DownImg,
// and reports each completed index on stdout.
func SpiderWeb(url string) {
	html, err := HttpGet(url)
	if err != nil {
		fmt.Println("页面读取出错")
		// Nothing to scrape on a failed fetch; the original fell through
		// and ran the regex over an empty string.
		return
	}

	// 使用正则表达式提取. Non-greedy (.*?) stops at the FIRST closing
	// quote; the original greedy (.*) captured up to the last quote on
	// the line, corrupting the URL whenever another attribute followed.
	ret := regexp.MustCompile(`data-original="(.*?)"`)

	// 提取需要的信息
	imglist := ret.FindAllStringSubmatch(html, -1)

	page := make(chan int)
	for k, img := range imglist {
		go DownImg(k, img[1], page)
	}
	// Collect exactly one completion message per spawned goroutine.
	for range imglist {
		fmt.Printf("图片%d下载完成\n", <-page)
	}
}
// Working starts the crawl of the catalog listing page.
func Working() {
	const catalogURL = "https://www.yy.com/catalog"
	SpiderWeb(catalogURL)
}
// main is the program entry point: it runs the crawler once and exits.
func main() {
Working()
}