From a2c8f2f6fd04e46c562f32118672da48048ca72d Mon Sep 17 00:00:00 2001
From: bianjiajie
Date: Sun, 15 Jan 2023 21:24:51 +0800
Subject: [PATCH] =?UTF-8?q?[feat]=20=E7=88=AC=E5=8F=96=E5=9B=BE=E6=96=87?=
 =?UTF-8?q?=E5=8A=A8=E6=80=81=E7=9A=84=E5=9B=BE=E7=89=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 internal/app/api/idl/bilibili_picture.go |  33 +++++
 internal/app/spider/picture.go           | 195 ++++++++++++++++++++++++++
 internal/pkg/bilibili/video.go           |  20 +--
 internal/repository/bilibili_picture.go  |  76 ++++++++++
 4 files changed, 314 insertions(+), 10 deletions(-)
 create mode 100644 internal/app/api/idl/bilibili_picture.go
 create mode 100644 internal/app/spider/picture.go
 create mode 100644 internal/repository/bilibili_picture.go

diff --git a/internal/app/api/idl/bilibili_picture.go b/internal/app/api/idl/bilibili_picture.go
new file mode 100644
index 0000000..0101591
--- /dev/null
+++ b/internal/app/api/idl/bilibili_picture.go
@@ -0,0 +1,33 @@
+package idl
+
+// BilibiliPicture is the GORM model of one image taken from a Bilibili
+// picture dynamic. Rows are stored in the table returned by TableName.
+type BilibiliPicture struct {
+	ID        uint64 `gorm:"primarykey"`
+	Url       string `gorm:"column:url"`        // image URL
+	DynamicID uint64 `gorm:"column:dynamic_id"` // ID of the dynamic the image belongs to
+	TopicName string `gorm:"column:topic_name"` // topic the dynamic was found under
+	SentAt    uint64 `gorm:"column:sent_at"`    // timestamp reported for the dynamic
+	CreatedAt uint64 `gorm:"autoCreateTime"`
+	UpdatedAt uint64 `gorm:"autoUpdateTime"`
+}
+
+// BilibiliPictureDTO is the JSON shape of a picture.
+type BilibiliPictureDTO struct {
+	ID        uint64 `json:"id"`
+	Url       string `json:"url"`
+	CreatedAt uint64 `json:"created_at"`
+}
+
+// TableName returns the name of the table that stores BilibiliPicture rows.
+func (BilibiliPicture) TableName() string {
+	return "bilibili_pictures"
+}
+
+// BilibiliPictureRepository is the storage interface for BilibiliPicture.
+type BilibiliPictureRepository interface {
+	// Create stores the given pictures.
+	Create(items []*BilibiliPicture) error
+	// FindMaxDynamicID returns the largest dynamic ID stored for topicName.
+	FindMaxDynamicID(topicName string) (*uint64, error)
+}
diff --git a/internal/app/spider/picture.go b/internal/app/spider/picture.go
new file mode 100644
index 0000000..ea6c0ca
--- /dev/null
+++ b/internal/app/spider/picture.go
@@ -0,0 +1,195 @@
+package spider
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"strconv"
+	"time"
+
+	"git.vtb.link/eoefans/internal/app/api/idl"
+	"git.vtb.link/eoefans/internal/pkg/bilibili"
+	"git.vtb.link/eoefans/internal/repository"
+	"github.com/pkg/errors"
+	"go.uber.org/zap"
+	"gorm.io/gorm"
+)
+
+// Picture crawls the picture dynamics of a fixed set of Bilibili topics and
+// stores the image URLs it finds in the database.
+type Picture struct {
+	stopChan  chan bool // tells the ticker goroutine started by Run to exit
+	db        *gorm.DB
+	logger    *zap.Logger
+	sdk       *bilibili.SDK
+	isRunning bool // set by Run, cleared by Stop
+}
+
+// NewPicture returns a Picture spider that is idle until Run is called.
+func NewPicture(db *gorm.DB, logger *zap.Logger, sdk *bilibili.SDK) *Picture {
+	return &Picture{
+		stopChan: make(chan bool),
+		db:       db,
+		logger:   logger,
+		sdk:      sdk,
+	}
+}
+
+// Stop tells the ticker goroutine started by Run to exit. It returns nil at
+// once when the spider is not running, and an error when ctx is done before
+// the goroutine takes the signal (for instance while a crawl is in progress).
+func (p *Picture) Stop(ctx context.Context) error {
+	p.logger.Info("stopping spider server")
+
+	if !p.isRunning {
+		return nil
+	}
+	// stopChan is unbuffered, so the send must race against ctx; a bare send
+	// would block past the deadline whenever nobody is receiving.
+	select {
+	case p.stopChan <- true:
+		p.isRunning = false
+		return nil
+	case <-ctx.Done():
+		return errors.New("shutdown spider server timeout")
+	}
+}
+
+// Run starts the spider and returns immediately. One goroutine performs a
+// first crawl right away; a second one repeats the crawl every 60 minutes
+// until Stop is called. ctx is currently unused.
+func (p *Picture) Run(ctx context.Context) error {
+	p.isRunning = true
+
+	go func() {
+		if err := p.spider(); err != nil {
+			p.logger.Error("start spider server error", zap.Error(err))
+		}
+	}()
+
+	go func() {
+		tk := time.NewTicker(60 * time.Minute)
+		// Release the ticker when the loop exits.
+		defer tk.Stop()
+		for {
+			select {
+			case <-tk.C:
+				p.logger.Info("[tick] picture spider", zap.Time("time", time.Now()))
+				if err := p.spider(); err != nil {
+					p.logger.Error("start picture server error", zap.Error(err))
+				}
+			case <-p.stopChan:
+				return
+			}
+		}
+	}()
+
+	return nil
+}
+
+// spider crawls every topic once. Failures are logged per topic and never
+// abort the remaining topics, so the returned error is always nil.
+func (p *Picture) spider() error {
+	topics := []string{
+		bilibili.TopicNameGoGo,
+		bilibili.TopicNameMino,
+		bilibili.TopicNameUn,
+		bilibili.TopicNameMoMo,
+		bilibili.TopicNameWan,
+		bilibili.TopicNameEOE,
+	}
+	repo := repository.NewBilibiliPicture(p.db)
+	for _, topic := range topics {
+		p.crawlTopic(repo, topic)
+	}
+	return nil
+}
+
+// crawlTopic pages through the dynamics of one topic and stores the pictures
+// of every picture dynamic whose ID is greater than the largest ID already
+// stored for that topic. It stops at the first dynamic that is not newer than
+// that ID, when the API reports no further page, or when a request fails.
+func (p *Picture) crawlTopic(repo idl.BilibiliPictureRepository, topic string) {
+	maxID, err := repo.FindMaxDynamicID(topic)
+	if err != nil {
+		p.logger.Error("FindMaxDynamicID error", zap.String("topic_name", topic), zap.Error(err))
+		return
+	}
+	var known uint64
+	if maxID != nil {
+		known = *maxID
+	}
+
+	var offset uint64
+	for {
+		data, err := p.sdk.TopicDynamic(topic, offset)
+		if err != nil {
+			p.logger.Error("TopicDynamic error", zap.String("topic_name", topic), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
+			time.Sleep(500 * time.Millisecond)
+			return
+		}
+		hasMore := data.HasMore == 1
+		if next, err := strconv.ParseUint(data.Offset, 10, 64); err == nil {
+			offset = next
+		} else {
+			// Without a usable offset the next page cannot be requested.
+			hasMore = false
+		}
+
+		var items []*idl.BilibiliPicture
+		reachedKnown := false
+		for _, card := range data.Cards {
+			if card.Desc.Type != bilibili.DynamicDraw {
+				continue
+			}
+			dynamicID, err := strconv.ParseUint(card.Desc.DynamicId, 10, 64)
+			if err != nil {
+				// An unparsable ID must not be mistaken for an already stored one.
+				p.logger.Error("parse dynamic_id error", zap.String("topic_name", topic), zap.String("dynamic_id", card.Desc.DynamicId), zap.Error(err))
+				continue
+			}
+			if dynamicID <= known {
+				// Everything from here on was crawled before (assumes the API
+				// lists dynamics newest first): stop scanning and stop paging.
+				reachedKnown = true
+				break
+			}
+			pictures, err := parsePicturesFromCard(card.Card)
+			if err != nil {
+				p.logger.Error("ParsePicturesFromCard error", zap.String("topic_name", topic), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
+				continue
+			}
+			for _, url := range pictures {
+				items = append(items, &idl.BilibiliPicture{
+					Url:       url,
+					DynamicID: dynamicID,
+					TopicName: topic,
+					SentAt:    card.Desc.TimeStamp,
+				})
+			}
+		}
+
+		if len(items) != 0 {
+			if err := repo.Create(items); err != nil {
+				p.logger.Error("Create bilibli_pictures error", zap.String("topic_name", topic), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
+			}
+		}
+		if !hasMore || reachedKnown {
+			return
+		}
+	}
+}
+
+// parsePicturesFromCard decodes the JSON text of a picture dynamic's card and
+// returns the image URLs it lists.
+func parsePicturesFromCard(data string) ([]string, error) {
+	var content bilibili.DynamicCardContent
+	if err := json.Unmarshal([]byte(data), &content); err != nil {
+		return nil, err
+	}
+	pics := make([]string, 0, len(content.Item.Pictures))
+	for _, v := range content.Item.Pictures {
+		pics = append(pics, v.ImgSrc)
+	}
+	return pics, nil
+}
diff --git a/internal/pkg/bilibili/video.go b/internal/pkg/bilibili/video.go
index 837189c..a5b82dc 100644
--- a/internal/pkg/bilibili/video.go
+++ b/internal/pkg/bilibili/video.go
@@ -1,7 +1,6 @@
 package bilibili
 
 import (
-	"encoding/json"
 	"fmt"
 	"net/http"
 	"net/url"
@@ -28,15 +27,14 @@ type DynamicType uint
 const (
 	DynamicDraw DynamicType = 2 //图片动态
 )
-
 const (
 	//topicHistory用topic_id查出来的数据有问题,故暂时用topic_name
-	topNameWan  = "小莞熊在这里"
-	topNameUn   = "柚恩的蜜罐子"
-	topNameGoGo = "GOGO队立大功!" //中文感叹号
-	topNameMoMo = "虞你在一起"
-	topNameMino = "和米诺的对抗路日常"
-	topNameEOE  = "EOE的魔法盒"
+	TopicNameWan  = "小莞熊在这里"
+	TopicNameUn   = "柚恩的蜜罐子"
+	TopicNameGoGo = "GOGO队立大功!" // ends with a full-width (Chinese) exclamation mark
+	TopicNameMoMo = "虞你在一起"
+	TopicNameMino = "和米诺的对抗路日常"
+	TopicNameEOE  = "EOE的魔法盒"
 )
 
 type SDK struct {
@@ -62,9 +60,11 @@ type DynamicInfo struct {
 
 type DynamicCard struct {
 	Desc struct {
-		Type DynamicType `json:"type"`
+		Type      DynamicType `json:"type"`
+		DynamicId string      `json:"dynamic_id"`
+		TimeStamp uint64      `json:"timestamp"`
 	} `json:"desc"`
-	Card json.RawMessage `json:"card"`
+	Card string `json:"card"`
 }
 
 // Card是json字符串,需要进一步解析
diff --git a/internal/repository/bilibili_picture.go b/internal/repository/bilibili_picture.go
new file mode 100644
index 0000000..5d63380
--- /dev/null
+++ b/internal/repository/bilibili_picture.go
@@ -0,0 +1,76 @@
+package repository
+
+import (
+	"git.vtb.link/eoefans/internal/app/api/idl"
+	"gorm.io/gorm"
+)
+
+// NewBilibiliPicture returns a BilibiliPictureRepository that runs its
+// queries on tx.
+func NewBilibiliPicture(tx *gorm.DB) idl.BilibiliPictureRepository {
+	return &BilibiliPictureMysqlImpl{tx: tx}
+}
+
+// BilibiliPictureMysqlImpl implements idl.BilibiliPictureRepository with GORM.
+type BilibiliPictureMysqlImpl struct {
+	tx *gorm.DB
+}
+
+// Create inserts the items whose URL is not stored yet; the lookup and the
+// insert share one transaction. Items whose URL is already stored, or repeats
+// an earlier item of the same call, are dropped without an error.
+func (impl *BilibiliPictureMysqlImpl) Create(items []*idl.BilibiliPicture) error {
+	if len(items) == 0 {
+		return nil
+	}
+	return impl.tx.Transaction(func(tx *gorm.DB) error {
+		urls := make([]string, 0, len(items))
+		for _, item := range items {
+			urls = append(urls, item.Url)
+		}
+		var stored []*idl.BilibiliPicture
+		err := tx.Table(idl.BilibiliPicture{}.TableName()).Where("url in (?)", urls).Distinct("url").Find(&stored).Error
+		if err != nil {
+			return err
+		}
+
+		// seen starts with the stored URLs and grows with every accepted item,
+		// which also removes duplicates inside the batch.
+		seen := make(map[string]struct{}, len(stored)+len(items))
+		for _, s := range stored {
+			seen[s.Url] = struct{}{}
+		}
+		fresh := make([]*idl.BilibiliPicture, 0, len(items))
+		for _, item := range items {
+			if _, dup := seen[item.Url]; dup {
+				continue
+			}
+			seen[item.Url] = struct{}{}
+			fresh = append(fresh, item)
+		}
+		if len(fresh) == 0 {
+			// Nothing new; GORM reports an error when asked to create an
+			// empty slice.
+			return nil
+		}
+		return tx.Table(idl.BilibiliPicture{}.TableName()).Create(&fresh).Error
+	})
+}
+
+// FindMaxDynamicID returns the largest dynamic_id stored for topicName, or 0
+// when the topic has no rows. The returned pointer is non-nil whenever the
+// error is nil.
+func (impl *BilibiliPictureMysqlImpl) FindMaxDynamicID(topicName string) (*uint64, error) {
+	var id uint64
+	// A single aggregate over the topic: grouping by dynamic_id would yield one
+	// row per dynamic instead of the maximum, and COALESCE keeps the result
+	// scannable when the topic has no rows (MAX is NULL then).
+	err := impl.tx.Table(idl.BilibiliPicture{}.TableName()).
+		Select("COALESCE(MAX(dynamic_id), 0) AS id").
+		Where("topic_name = ?", topicName).
+		Scan(&id).Error
+	if err != nil {
+		return nil, err
+	}
+	return &id, nil
+}