proxy-detector/pkg/proxy/crawl.go

51 lines
1.2 KiB
Go
Raw Normal View History

package proxy
import (
"context"
2024-12-10 21:13:07 +08:00
"sync"
"time"
"gitea.timerzz.com/kedaya_haitao/proxy-detector/pkg/getter"
healthcheck "gitea.timerzz.com/kedaya_haitao/proxy-detector/pkg/health-check"
"gitea.timerzz.com/kedaya_haitao/proxy-detector/pkg/proxy/structs"
2024-12-10 21:13:07 +08:00
"gitea.timerzz.com/kedaya_haitao/proxy-detector/pkg/worker"
log "github.com/sirupsen/logrus"
)
func CrawlProxies(ctx context.Context, getters []getter.Getter) {
2024-12-10 21:13:07 +08:00
var proxies structs.Proxies
log.Infof("共%d个抓取源", len(getters))
2024-12-10 21:13:07 +08:00
var wg sync.WaitGroup
for _, gtr := range getters {
wg.Add(1)
err := worker.Pool.Submit(func() {
defer wg.Done()
if ps := gtr.Get(); len(ps) > 0 {
proxies.Add(ps)
}
})
if err != nil {
log.Errorln("添加并发任务失败: ", err)
}
}
2024-12-10 21:13:07 +08:00
wg.Wait()
log.Infof("Crawled %d proxies", proxies.Len())
proxyList := healthcheck.CleanBadProxies(ctx, proxies.Get())
log.Infof("Health checked %d proxies", proxyList)
structs.ProxiesList.Add(proxyList)
return
}
// CronCrawl invokes CrawlProxies once every interval minutes and returns
// when ctx is done. It blocks the calling goroutine for its whole lifetime.
//
// The first crawl happens only after one full interval has elapsed; nothing
// is crawled at call time. interval must be non-zero, because time.NewTicker
// panics on a non-positive duration.
func CronCrawl(ctx context.Context, getters []getter.Getter, interval uint64) {
	period := time.Duration(interval) * time.Minute
	tick := time.NewTicker(period)
	defer tick.Stop()

	done := ctx.Done()
	for {
		select {
		case <-tick.C:
			CrawlProxies(ctx, getters)
		case <-done:
			return
		}
	}
}