Skip to content

Commit

Permalink
[feat] 爬取图文动态的图片
Browse files Browse the repository at this point in the history
  • Loading branch information
kyrie committed Jan 15, 2023
1 parent 2b710b4 commit a2c8f2f
Show file tree
Hide file tree
Showing 4 changed files with 263 additions and 10 deletions.
26 changes: 26 additions & 0 deletions internal/app/api/idl/bilibili_picture.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package idl

type BilibiliPicture struct {
ID uint64 `gorm:"primarykey"`
Url string `gorm:"column:url"`
DynamicID uint64 `gorm:"column:dynamic_id"`
TopicName string `gorm:"column:topic_name"`
SentAt uint64 `gorm:"column:sent_at"`
CreatedAt uint64 `gorm:"autoCreateTime"`
UpdatedAt uint64 `gorm:"autoUpdateTime"`
}

type BilibiliPictureDTO struct {
ID uint64 `json:"id"`
Url string `json:"url"`
CreatedAt uint64 `json:"created_at"`
}

func (BilibiliPicture) TableName() string {
return "bilibili_pictures"
}

type BilibiliPictureRepository interface {
Create(items []*BilibiliPicture) error
FindMaxDynamicID(topicName string) (*uint64, error)
}
167 changes: 167 additions & 0 deletions internal/app/spider/picture.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
package spider

import (
"context"
"encoding/json"
"fmt"
"strconv"
"time"

"git.vtb.link/eoefans/internal/app/api/idl"
"git.vtb.link/eoefans/internal/pkg/bilibili"
"git.vtb.link/eoefans/internal/repository"
"github.com/pkg/errors"
"go.uber.org/zap"
"gorm.io/gorm"
)

type Picture struct {
stopChan chan bool
db *gorm.DB
logger *zap.Logger
sdk *bilibili.SDK
isRunning bool
}

func NewPicture(db *gorm.DB, logger *zap.Logger, sdk *bilibili.SDK) *Picture {
return &Picture{
stopChan: make(chan bool),
db: db,
logger: logger,
sdk: sdk,
}
}

func (v *Picture) Stop(ctx context.Context) error {
v.logger.Info("stopping spider server")

for {
select {
case <-ctx.Done():
return errors.New("shutdown spider server timeout")
default:
if err := v.stop(); err != nil {
return errors.Wrap(err, "shutdown spider server error")
}
return nil
}
}
}

func (v *Picture) stop() error {
v.stopChan <- true
v.isRunning = false
return nil
}

func (v *Picture) Run(ctx context.Context) error {
tk := time.NewTicker(60 * time.Minute)
v.isRunning = true

go func() {
if err := v.spider(); err != nil {
v.logger.Error("start spider server error", zap.Error(err))
}
}()

go func(_tk *time.Ticker) {
for {
select {
case <-_tk.C:
v.logger.Info("[tick] picture spider", zap.Time("time", time.Now()))
if err := v.spider(); err != nil {
v.logger.Error("start picture server error", zap.Error(err))
}
case <-v.stopChan:
return
}
}
}(tk)

return nil
}

func (p *Picture) spider() error {
//把当前数据库最大的动态ID查出来
//调用接口,将大于当前动态ID的都入DB,如果存在小于的,则可提前结束,尽量保证没有重复数据
topics := []string{
bilibili.TopicNameGoGo,
bilibili.TopicNameMino,
bilibili.TopicNameUn,
bilibili.TopicNameMoMo,
bilibili.TopicNameWan,
bilibili.TopicNameEOE,
}
for _, topic := range topics {
curMaxDynamicID, err := repository.NewBilibiliPicture(p.db).FindMaxDynamicID(topic)
if err != nil {
p.logger.Error("FindMaxDynamicID error", zap.String("topic_name", topic), zap.Error(err))
continue
}
var hasMore uint = 1
var offset uint64 = 0
exist := false //判断有没有已经爬过
for hasMore == 1 && !exist {
data, err := p.sdk.TopicDynamic(topic, offset)
if err != nil {
p.logger.Error("TopicDynamic error", zap.String("topic_name", topic), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
time.Sleep(500 * time.Millisecond)
break
}
hasMore = data.HasMore
dynamicID, err := strconv.ParseUint(data.Offset, 10, 64)
if err == nil {
offset = dynamicID
} else {
hasMore = 0
}
items := make([]*idl.BilibiliPicture, 0)
for _, v := range data.Cards {
switch v.Desc.Type {
case bilibili.DynamicDraw:
dynamicID, _ := strconv.ParseUint(v.Desc.DynamicId, 10, 64)
if dynamicID <= *curMaxDynamicID {
//后面所有的都是爬过的,提前结束,后续也不再请求api
exist = true
break
}
pictures, err := parsePicturesFromCard(v.Card)
if err != nil {
p.logger.Error("ParsePicturesFromCard error", zap.String("topic_name", topic), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
continue
}
for _, url := range pictures {
items = append(items, &idl.BilibiliPicture{
Url: url,
DynamicID: dynamicID,
TopicName: topic,
SentAt: v.Desc.TimeStamp,
})
}
default:
continue
}
}
//插入数据
if len(items) != 0 {
err := repository.NewBilibiliPicture(p.db).Create(items)
if err != nil {
p.logger.Error("Create bilibli_pictures error", zap.String("topic_name", topic), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
}
}
}
}
return nil
}

func parsePicturesFromCard(data string) ([]string, error) {
var content bilibili.DynamicCardContent
if err := json.Unmarshal([]byte(data), &content); err != nil {
return nil, err
}
pics := make([]string, 0, len(content.Item.Pictures))
for _, v := range content.Item.Pictures {
pics = append(pics, v.ImgSrc)
}
return pics, nil
}
20 changes: 10 additions & 10 deletions internal/pkg/bilibili/video.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package bilibili

import (
"encoding/json"
"fmt"
"net/http"
"net/url"
Expand All @@ -28,15 +27,14 @@ type DynamicType uint
const (
DynamicDraw DynamicType = 2 //图片动态
)

const (
//topicHistory用topic_id查出来的数据有问题,故暂时用topic_name
topNameWan = "小莞熊在这里"
topNameUn = "柚恩的蜜罐子"
topNameGoGo = "GOGO队立大功!" //中文感叹号
topNameMoMo = "虞你在一起"
topNameMino = "和米诺的对抗路日常"
topNameEOE = "EOE的魔法盒"
TopicNameWan = "小莞熊在这里"
TopicNameUn = "柚恩的蜜罐子"
TopicNameGoGo = "GOGO队立大功!" //中文感叹号
TopicNameMoMo = "虞你在一起"
TopicNameMino = "和米诺的对抗路日常"
TopicNameEOE = "EOE的魔法盒"
)

type SDK struct {
Expand All @@ -62,9 +60,11 @@ type DynamicInfo struct {

type DynamicCard struct {
Desc struct {
Type DynamicType `json:"type"`
Type DynamicType `json:"type"`
DynamicId string `json:"dynamic_id"`
TimeStamp uint64 `json:"timestamp"`
} `json:"desc"`
Card json.RawMessage `json:"card"`
Card string `json:"card"`
}

// Card是json字符串,需要进一步解析
Expand Down
60 changes: 60 additions & 0 deletions internal/repository/bilibili_picture.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package repository

import (
"git.vtb.link/eoefans/internal/app/api/idl"
"gorm.io/gorm"
)

func NewBilibiliPicture(tx *gorm.DB) idl.BilibiliPictureRepository {
return &BilibiliPictureMysqlImpl{tx: tx}
}

type BilibiliPictureMysqlImpl struct {
tx *gorm.DB
}

func (impl *BilibiliPictureMysqlImpl) Create(items []*idl.BilibiliPicture) error {
if len(items) == 0 {
return nil
}
//针对url去重
return impl.tx.Transaction(func(_tx *gorm.DB) error {
urls := make([]string, 0, len(items))
for _, v := range items {
urls = append(urls, v.Url)
}
var exist []*idl.BilibiliPicture
err := _tx.Table(idl.BilibiliPicture{}.TableName()).Where("url in (?)", urls).Distinct("url").Find(&exist).Error
if err != nil {
return err
}
filter := make([]*idl.BilibiliPicture, 0)
for i := range items {
find := false
for j := range exist {
if items[i].Url == exist[j].Url {
find = true
break
}
}
if !find {
filter = append(filter, items[i])
}
}
err = _tx.Table(idl.BilibiliPicture{}.TableName()).Create(&filter).Error
if err != nil {
return err
}
return nil
})
}

func (impl *BilibiliPictureMysqlImpl) FindMaxDynamicID(topicName string) (*uint64, error) {
var id uint64
conn := impl.tx.Table(idl.BilibiliPicture{}.TableName())
err := conn.Select("max(dynamic_id) as id").Where("topic_name = ?", topicName).Group("dynamic_id").Scan(&id).Error
if err != nil {
return nil, err
}
return &id, nil
}

0 comments on commit a2c8f2f

Please sign in to comment.