Skip to content

Commit

Permalink
feat:调整图片爬虫
Browse files Browse the repository at this point in the history
  • Loading branch information
kyrie committed Feb 5, 2023
1 parent 72edd3f commit 18dcedf
Show file tree
Hide file tree
Showing 8 changed files with 261 additions and 115 deletions.
19 changes: 18 additions & 1 deletion database/init.sql
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,21 @@ create table bilibili_dynamics (

alter table bilibili_dynamics add column topic_details json default null comment '动态的#xxx#' after pictures;
alter table bilibili_dynamics add column feedback int default 0 comment '反馈类型' after dynamic_id;
alter table bilibili_dynamics add column verify boolean default false comment '是否审核过' after feedback;
alter table bilibili_dynamics add column verify boolean default false comment '是否审核过' after feedback;
alter table bilibili_dynamics add column pictures_num int default 0 comment '图片数量' after pictures;
alter table bilibili_dynamics add column actual_topic_id bigint default 0 comment '实际的话题id' after topic_id;
alter table bilibili_dynamics add column cp_topic_id bigint default 0 comment 'cp的话题id' after topic_id;

-- Picture library: stores individual images collected from Bilibili dynamics.
-- dynamic_id holds the id of the source dynamic; no foreign-key constraint is
-- declared, the relation is resolved by the application.
-- The table comment previously read '动态' (copied from bilibili_dynamics);
-- it now describes this table's actual content.
create table bilibili_pictures (
    id bigint unsigned not null auto_increment primary key comment 'id',
    -- Image URL.
    img_src varchar(255) not null comment '图片链接',
    -- Image attributes as a JSON document.
    img_attr json not null comment '图片属性',
    -- Id of the Bilibili dynamic the image was taken from.
    dynamic_id bigint unsigned not null comment 'B站动态id',
    -- Feedback type.
    feedback int default 0 comment '反馈类型',
    -- Whether the image has been reviewed.
    verify boolean default false comment '是否审核过',
    -- Creation / last-update times stored as unsigned integers.
    created_at bigint unsigned not null comment '创建时间',
    updated_at bigint unsigned not null comment '更新时间',
    -- Deletion time; NULL when the row has not been deleted.
    deleted_at datetime null comment '删除时间',
    index idx_dynamic_id(dynamic_id) comment '动态id索引',
    index idx_img_src(img_src) comment '图片链接索引'
)Engine=InnoDB comment '图片库' charset 'utf8mb4';
80 changes: 63 additions & 17 deletions internal/app/api/idl/bilibili_picture.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"encoding/json"
"errors"
"time"

"gorm.io/gorm"
)

type DynamicFeedback int
Expand All @@ -25,24 +27,60 @@ var DynamicFeedbackMap = map[DynamicFeedback]struct{}{
Uncomfortable: {},
}

// BilibiliPicture is the picture-library model: a single image that was
// collected from a Bilibili dynamic. It maps to the bilibili_pictures table.
type BilibiliPicture struct {
	ID uint64 `gorm:"primarykey"`
	// DynamicID is the id of the Bilibili dynamic the image belongs to.
	DynamicID uint64 `gorm:"column:dynamic_id"`
	// ImgSrc is the image link.
	ImgSrc string `gorm:"column:img_src"`
	// ImgAttr holds the image attributes; it is persisted through the
	// type's Value/Scan methods as JSON.
	ImgAttr BilibiliPictureAttr `gorm:"column:img_attr"`
	// Feedback is the feedback type recorded for the image.
	Feedback DynamicFeedback `gorm:"column:feedback"`
	// Verify reports whether the image has been reviewed.
	Verify bool `gorm:"column:verify"`
	// CreatedAt / UpdatedAt are maintained by GORM (autoCreateTime /
	// autoUpdateTime) and stored as unsigned integers.
	CreatedAt uint64 `gorm:"autoCreateTime"`
	UpdatedAt uint64 `gorm:"autoUpdateTime"`
	// DeletedAt uses gorm.DeletedAt, GORM's soft-delete field type.
	DeletedAt gorm.DeletedAt `gorm:"index"`
	// Dynamic is the owning dynamic, associated by matching DynamicID on both
	// models. NOTE(review): presumably only populated when the association is
	// explicitly loaded — confirm against the repository queries.
	Dynamic *BilibiliDynamic `gorm:"foreignKey:DynamicID;references:DynamicID"`
}

// 图片来源于动态,以动态为单位
type BilibiliDynamic struct {
ID uint64 `gorm:"primarykey"`
UID uint64 `gorm:"column:uid"`
DynamicID uint64 `gorm:"column:dynamic_id"`
Feedback DynamicFeedback `gorm:"column:feedback"`
Verify bool `gorm:"column:verify"`
Pictures BilibiliDynamicPictures `gorm:"column:pictures"`
TopicDetails *BilibiliDynamicTopicDetails `gorm:"topic_details"`
TopicName string `gorm:"column:topic_name"`
TopicID uint64 `gorm:"column:topic_id"`
View uint64 `gorm:"column:view_nums"`
Repost uint64 `gorm:"column:repost"`
Comment uint64 `gorm:"column:comment_nums"`
Like uint64 `gorm:"column:favor"`
SentAt uint64 `gorm:"column:sent_at"`
CreatedAt uint64 `gorm:"autoCreateTime"`
UpdatedAt uint64 `gorm:"autoUpdateTime"`
ID uint64 `gorm:"primarykey"`
UID uint64 `gorm:"column:uid"`
DynamicID uint64 `gorm:"column:dynamic_id"`
Feedback DynamicFeedback `gorm:"column:feedback"`
Verify bool `gorm:"column:verify"`
Pictures BilibiliDynamicPictures `gorm:"column:pictures"`
PicturesNum int `gorm:"column:pictures_num"`
TopicDetails *BilibiliDynamicTopicDetails `gorm:"topic_details"`
TopicName string `gorm:"column:topic_name"`
TopicID uint64 `gorm:"column:topic_id"`
ActualTopicID uint64 `gorm:"column:actual_topic_id"`
CPTopicID uint64 `gorm:"column:cp_topic_id"`
View uint64 `gorm:"column:view_nums"`
Repost uint64 `gorm:"column:repost"`
Comment uint64 `gorm:"column:comment_nums"`
Like uint64 `gorm:"column:favor"`
SentAt uint64 `gorm:"column:sent_at"`
CreatedAt uint64 `gorm:"autoCreateTime"`
UpdatedAt uint64 `gorm:"autoUpdateTime"`
Pics []BilibiliPicture `gorm:"foreignKey:DynamicID;references:DynamicID"`
}

// Value implements driver.Valuer by serialising the attributes to JSON so
// they can be written to a database column.
func (p BilibiliPictureAttr) Value() (driver.Value, error) {
	encoded, err := json.Marshal(p)
	return encoded, err
}

// Scan implements sql.Scanner. It accepts only a []byte holding a JSON
// document and decodes it into the receiver; any other input type is rejected.
// The receiver is left untouched when decoding fails.
func (p *BilibiliPictureAttr) Scan(input interface{}) error {
	raw, isBytes := input.([]byte)
	if !isBytes {
		return errors.New("invalid input in Scan")
	}
	// Decode into a scratch value first so a failed decode cannot leave the
	// receiver half-populated.
	var decoded BilibiliPictureAttr
	if err := json.Unmarshal(raw, &decoded); err != nil {
		return err
	}
	*p = decoded
	return nil
}

func (p BilibiliDynamicPictures) Value() (driver.Value, error) {
Expand Down Expand Up @@ -139,10 +177,14 @@ type BilibiliRandomPictureTag struct {
}
type BilibiliDynamicPictures []BilibiliDynamicPicture
// BilibiliDynamicPicture describes one image attached to a dynamic: the image
// attributes plus the image link.
type BilibiliDynamicPicture struct {
	// Embedded without a JSON tag, so encoding/json promotes its fields
	// (img_height, img_size, img_width) to the same level as img_src.
	BilibiliPictureAttr
	ImgSrc string `json:"img_src"`
}

type BilibiliPictureAttr struct {
Height float64 `json:"img_height"`
Size float64 `json:"img_size"`
Width float64 `json:"img_width"`
ImgSrc string `json:"img_src"`
}

type BilibiliDynamicTopicDetails []BilibiliDynamicTopicDetail
Expand All @@ -155,6 +197,10 @@ func (BilibiliDynamic) TableName() string {
return "bilibili_dynamics"
}

// TableName returns the name of the database table that stores
// BilibiliPicture rows.
func (BilibiliPicture) TableName() string {
	const table = "bilibili_pictures"
	return table
}

type BilibiliPictureRepository interface {
Create(items []*BilibiliDynamic) error
FindMaxDynamicID(topicName string) (*uint64, error)
Expand Down
2 changes: 1 addition & 1 deletion internal/app/api/service/bilbil_picture.go
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ func (service *BilbilPicture) Recommend(ctx context.Context, req idl.BilibiliPic
tx := service.db.WithContext(ctx)
picRepository := repository.NewBilibiliPicture(tx)
now := time.Now()
list, err := picRepository.Recommend(now.Add(-(30 * 24 * time.Hour)), now, req.Page, picRecommendDefaultSize, req.TopicID)
list, err := picRepository.Recommend(now.Add(-(15 * 24 * time.Hour)), now, req.Page, picRecommendDefaultSize, req.TopicID)
if err != nil {
return nil, err
}
Expand Down
186 changes: 103 additions & 83 deletions internal/app/spider/picture.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,100 +82,116 @@ func (p *Picture) Run(ctx context.Context) error {
}

func (p *Picture) spider() error {
//把当前数据库最大的动态ID查出来
//调用接口,将大于当前动态ID的都入DB,如果存在小于的,则可提前结束,尽量保证没有重复数据
topicsMap := map[string]uint64{
bilibili.TopicNameGoGo: bilibili.TopicIDGoGo,
bilibili.TopicNameMino: bilibili.TopicIDMino,
bilibili.TopicNameUn: bilibili.TopicIDUn,
bilibili.TopicNameMoMo: bilibili.TopicIDMoMo,
bilibili.TopicNameWan: bilibili.TopicIDWan,
bilibili.TopicNameEOE: bilibili.TopicIDEOE,
topicsMap := map[uint64][]uint64{
bilibili.TopicIDGoGo: {bilibili.TopicIDGoGo, 28039056, 28039057, 28621067, 30029596, 30387922},
bilibili.TopicIDMino: {bilibili.TopicIDMino, 28045077, 28611311, 31329504, 28235940, 28197598},
bilibili.TopicIDUn: {bilibili.TopicIDUn, 28187701},
bilibili.TopicIDMoMo: {bilibili.TopicIDMoMo, 28055152, 28077478, 28298854, 28535695},
bilibili.TopicIDWan: {bilibili.TopicIDWan, 17283297, 28044522, 28653712, 28909298, 29297260, 31565489},
bilibili.TopicIDEOE: {bilibili.TopicIDEOE, 28627394},
}
black := map[uint64]struct{}{
383884380: {}, //水图太多,大部分是食物图
}
for topicName, topicID := range topicsMap {
curMaxDynamicID, err := repository.NewBilibiliPicture(p.db).FindMaxDynamicID(topicName)
cpMap := map[uint64]uint64{
28909298: bilibili.TopicIDGoGo,
29297260: bilibili.TopicIDMino,
31565489: bilibili.TopicIDMino,
30387922: bilibili.TopicIDMoMo,
28235940: bilibili.TopicIDUn,
28535695: bilibili.TopicIDUn,
28197598: bilibili.TopicIDMoMo,
}
standard := time.Now().Add(-(time.Hour * 24)).Unix()
for topicID, vec := range topicsMap {
for _, actualTopID := range vec {
var cpTopicID uint64
res, ok := cpMap[actualTopID]
if ok {
cpTopicID = res
}
p.get(topicID, actualTopID, cpTopicID, uint64(standard), black)
}
}
return nil
}

func (p *Picture) get(topicID, actualTopID, cpTopicID, standard uint64, black map[uint64]struct{}) {
var hasMore uint = 1
var offset uint64 = 0
exist := false //判断有没有已经爬过
for hasMore == 1 && !exist {
time.Sleep(400 * time.Millisecond)
data, err := p.sdk.TopicDynamics(actualTopID, offset)
if err != nil {
p.logger.Error("FindMaxDynamicID error", zap.String("topic_name", topicName), zap.Error(err))
continue
p.logger.Error("TopicDynamics error", zap.Uint64("topic_id", actualTopID), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
time.Sleep(500 * time.Millisecond)
break
}
var hasMore uint = 1
var offset uint64 = 0
exist := false //判断有没有已经爬过
for hasMore == 1 && !exist {
time.Sleep(400 * time.Millisecond)
data, err := p.sdk.TopicDynamics(topicName, offset)
if err != nil {
p.logger.Error("TopicDynamics error", zap.String("topic_name", topicName), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
time.Sleep(500 * time.Millisecond)
break
}
hasMore = data.HasMore
dynamicID, err := strconv.ParseUint(data.Offset, 10, 64)
if err == nil {
offset = dynamicID
} else {
hasMore = 0
}
items := make([]*idl.BilibiliDynamic, 0)
for _, v := range data.Cards {
switch v.Desc.Type {
case bilibili.DynamicDraw:
if v.Desc.DynamicID <= *curMaxDynamicID {
//后面所有的都是爬过的,提前结束,后续也不再请求api
exist = true
break
}
if _, ok := black[v.Desc.UID]; ok {
continue
}
dynamic := &idl.BilibiliDynamic{
UID: v.Desc.UID,
DynamicID: v.Desc.DynamicID,
TopicName: topicName,
TopicID: topicID,
View: v.Desc.View,
Repost: v.Desc.Repost,
Comment: v.Desc.Comment,
Like: v.Desc.Like,
SentAt: v.Desc.TimeStamp,
}
pictures, err := parsePicturesFromCard(v.Card)
if err != nil {
p.logger.Error("ParsePicturesFromCard error", zap.String("topic_name", topicName), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
continue
}
if len(pictures) == 0 {
continue
}
dynamic.Pictures = pictures
topicDetails := make(idl.BilibiliDynamicTopicDetails, 0)
for _, v := range v.Display.TopicInfo.TopicDetails {
topicDetails = append(topicDetails, idl.BilibiliDynamicTopicDetail{
TopicID: v.TopicID,
TopicName: v.TopicName,
})
}
if len(topicDetails) != 0 {
dynamic.TopicDetails = &topicDetails
}
items = append(items, dynamic)
default:
hasMore = data.HasMore
dynamicID, err := strconv.ParseUint(data.Offset, 10, 64)
if err == nil {
offset = dynamicID
} else {
hasMore = 0
}
items := make([]*idl.BilibiliDynamic, 0)
for _, v := range data.Cards {
switch v.Desc.Type {
case bilibili.DynamicDraw:
if v.Desc.TimeStamp <= uint64(standard) {
//后面所有的都是爬过的,提前结束,后续也不再请求api
exist = true
break
}
if _, ok := black[v.Desc.UID]; ok {
continue
}
}
//插入数据
if len(items) != 0 {
err := repository.NewBilibiliPicture(p.db).Create(items)
dynamic := &idl.BilibiliDynamic{
UID: v.Desc.UID,
DynamicID: v.Desc.DynamicID,
TopicID: topicID,
ActualTopicID: actualTopID,
CPTopicID: cpTopicID,
View: v.Desc.View,
Repost: v.Desc.Repost,
Comment: v.Desc.Comment,
Like: v.Desc.Like,
SentAt: v.Desc.TimeStamp,
}
pictures, err := parsePicturesFromCard(v.Card)
if err != nil {
p.logger.Error("Create bilibli_pictures error", zap.String("topic_name", topicName), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
p.logger.Error("ParsePicturesFromCard error", zap.Uint64("topic_id", actualTopID), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
continue
}
if len(pictures) == 0 {
continue
}
dynamic.Pictures = pictures
dynamic.PicturesNum = len(pictures)
topicDetails := make(idl.BilibiliDynamicTopicDetails, 0)
for _, v := range v.Display.TopicInfo.TopicDetails {
topicDetails = append(topicDetails, idl.BilibiliDynamicTopicDetail{
TopicID: v.TopicID,
TopicName: v.TopicName,
})
}
if len(topicDetails) != 0 {
dynamic.TopicDetails = &topicDetails
}
items = append(items, dynamic)
default:
continue
}
}
//插入数据
if len(items) != 0 {
err := repository.NewBilibiliPicture(p.db).Create(items)
if err != nil {
p.logger.Error("Create bilibli_pictures error", zap.Uint64("topic_id", actualTopID), zap.String("offset", fmt.Sprintf("%d", offset)), zap.Error(err))
}
}
}
return nil
}

func parsePicturesFromCard(data string) ([]idl.BilibiliDynamicPicture, error) {
Expand All @@ -185,12 +201,16 @@ func parsePicturesFromCard(data string) ([]idl.BilibiliDynamicPicture, error) {
}
pics := make([]idl.BilibiliDynamicPicture, 0, len(content.Item.Pictures))
for _, v := range content.Item.Pictures {
pics = append(pics, idl.BilibiliDynamicPicture{
attr := idl.BilibiliPictureAttr{
Height: v.Height,
Size: v.Size,
Width: v.Width,
}
p := idl.BilibiliDynamicPicture{
ImgSrc: v.ImgSrc,
})
}
p.BilibiliPictureAttr = attr
pics = append(pics, p)
}
return pics, nil
}
2 changes: 1 addition & 1 deletion internal/app/spider/update_dynamic.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func (u *UpdateDynamic) spider() error {

size := 100
for p := 1; true; p++ {
list, err := repo.FindAllByPubDate(time.Now().Add(-(3 * 24 * time.Hour)), time.Now(), int64(p), int64(size))
list, err := repo.FindAllByPubDate(time.Now().Add(-(15 * 24 * time.Hour)), time.Now(), int64(p), int64(size))
if err != nil {
u.logger.Error("[UpdateDynamic spider()]FindAllByPubDate error", zap.Int("page", p), zap.Error(err))
return nil
Expand Down
3 changes: 1 addition & 2 deletions internal/pkg/bilibili/api_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,8 @@ func TestDynamic(t *testing.T) {

func TestDynamicList(t *testing.T) {
sdk := NewSDK(&zap.Logger{})
name := "EOE的魔法盒"
offset := 0
res, err := sdk.TopicDynamics(name, uint64(offset))
res, err := sdk.TopicDynamics(TopicIDEOE, uint64(offset))
if err != nil {
t.Error(err)
return
Expand Down
Loading

0 comments on commit 18dcedf

Please sign in to comment.