Skip to content

Commit da65f45

Browse files
committed
Your commit message
1 parent 029d779 commit da65f45

File tree

177 files changed

+393049
-0
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

177 files changed

+393049
-0
lines changed

.gitignore

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
# local #
tmp*/
cache/*
*/cache*/
tmp*.py
tmp*
*pickle
data/

# Zip Files/Packages #
*.7z
*.dmg
*.gz
*.iso
*.jar
*.rar
*.tar
*.zip

# Logs and databases #
*.log
*.sql
*.sqlite
.ipynb_checkpoints/
*.swp
*.vscode/
*.idea/
*.pyc
__pycache__
slurm*out

# OS files #
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db


.vim-arsync
scratch.norg
sync_to_red.sh

anno/
wandb/
logs/
*.pth

# personal
test.ipynb

jupyter/

phoenix-slurm*
batchscript-*

debug*

configs/config.json

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
{
  "model": {
    "model_cls": "VideoChat2_it",
    "vit_blip_model_path": "your_model_path/umt_l16_qformer.pth",
    "llama_model_path": "your_model_path/vicuna-7b-v0",
    "videochat2_model_path": "your_model_path/videochat2_7b_stage2.pth",
    "freeze_vit": false,
    "freeze_qformer": false,
    "max_txt_len": 512,
    "low_resource": false,
    "vision_encoder": {
      "name": "vit_l14",
      "img_size": 224,
      "patch_size": 16,
      "d_model": 1024,
      "encoder_embed_dim": 1024,
      "encoder_depth": 24,
      "encoder_num_heads": 16,
      "drop_path_rate": 0.0,
      "num_frames": 8,
      "tubelet_size": 1,
      "use_checkpoint": false,
      "checkpoint_num": 0,
      "pretrained": "",
      "return_index": -2,
      "vit_add_ln": true,
      "ckpt_num_frame": 4
    },
    "num_query_token": 32,
    "qformer_hidden_dropout_prob": 0.1,
    "qformer_attention_probs_dropout_prob": 0.1,
    "qformer_drop_path_rate": 0.2,
    "extra_num_query_token": 64,
    "qformer_text_input": true,
    "system": "",
    "start_token": "<Video>",
    "end_token": "</Video>",
    "img_start_token": "<Image>",
    "img_end_token": "</Image>",
    "random_shuffle": true,
    "use_lora": false,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1
  },
  "device": "cuda"
}

configs/config_bert.json

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522,
  "fusion_layer": 9,
  "encoder_width": 768,
  "cross_module": "ca"
}

configs/config_mistral.json

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
{
  "model": {
    "model_cls": "VideoChat2_it_mistral",
    "vit_blip_model_path": "/path_to_the_timesuite_root_folder/download/parameters/umt_l16_qformer.pth",
    "mistral_model_path": "/path_to_the_timesuite_root_folder/download/parameters/Mistral-7B-Instruct-v0.2",
    "videochat2_model_path": "/path_to_the_timesuite_root_folder/download/parameters/videochat2_mistral_7b_stage2.pth",
    "freeze_vit": false,
    "freeze_qformer": false,
    "max_txt_len": 512,
    "low_resource": false,
    "vision_encoder": {
      "name": "vit_l14",
      "img_size": 224,
      "patch_size": 16,
      "d_model": 1024,
      "encoder_embed_dim": 1024,
      "encoder_depth": 24,
      "encoder_num_heads": 16,
      "drop_path_rate": 0.0,
      "num_frames": 8,
      "tubelet_size": 1,
      "use_checkpoint": true,
      "checkpoint_num": 18,
      "pretrained": "",
      "return_index": -2,
      "vit_add_ln": true,
      "ckpt_num_frame": 4
    },
    "num_query_token": 32,
    "qformer_hidden_dropout_prob": 0.1,
    "qformer_attention_probs_dropout_prob": 0.1,
    "qformer_drop_path_rate": 0.2,
    "extra_num_query_token": 64,
    "qformer_text_input": true,
    "system": "",
    "start_token": "<Video>",
    "end_token": "</Video>",
    "add_second_msg": true,
    "img_start_token": "<Image>",
    "img_end_token": "</Image>",
    "random_shuffle": true,
    "return_question_instruction": false,
    "use_flash_attention": true,
    "use_lora": false,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1
  },
  "device": "cuda"
}

configs/instruction_data.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
"""Instruction-tuning dataset registry.

Each corpus entry is a 3-element list:

    [annotation_json_path, media_root, media_type]

where ``annotation_json_path`` lives under ``anno_root_it``, ``media_root`` is a
storage prefix (ceph/s3-style URI), and ``media_type`` is ``"video"`` for every
corpus defined here.
"""
import os as __os  # add "__" if not want to be exported
from copy import deepcopy as __deepcopy

# Root folder holding the TimePro annotation json files.
anno_root_it = "/path_to_the_timesuite_root_folder/download/datasets/TimePro"


# ============== pretraining datasets=================
available_corpus = dict(

    caption_youcook2=[
        f"{anno_root_it}/caption_youcook2.json",
        "pnorm2:s3://youcook2/split_videos",
        "video"
    ],
    conversation_videochat1=[
        f"{anno_root_it}/conversation_videochat1.json",
        "pnorm2:s3://webvid10m",
        "video"
    ],
    conversation_videochat2=[
        f"{anno_root_it}/conversation_videochat2.json",
        "pnorm:s3://videointernsegvideos",
        "video"
    ],
    conversation_videochatgpt=[
        f"{anno_root_it}/conversation_videochatgpt.json",
        "pnorm2:s3://anet/ANet_320p_fps30",
        "video"
    ],
    reasoning_star=[
        f"{anno_root_it}/reasoning_star.json",
        "pnorm2:s3://star/Charades_v1_480",
        "video"
    ],
    vqa_ego_qa=[
        f"{anno_root_it}/vqa_ego_qa.json",
        "pnorm2:s3://egoqa/split_videos",
        "video"
    ],

    # TimeIT
    timeit_ANet=[
        f"{anno_root_it}/timeit_ANet.json",
        "pnorm2:s3://anet",
        "video"
    ],
    timeit_COIN=[
        f"{anno_root_it}/timeit_COIN.json",
        "pnorm:s3://COIN_320p",
        "video"
    ],
    timeit_DiDeMo=[
        f"{anno_root_it}/timeit_DiDeMo.json",
        "sssd:s3://yjsBucket",
        "video"
    ],
    timeit_HiREST=[
        f"{anno_root_it}/timeit_HiREST.json",
        "pnorm2zxy:s3://hirest",
        "video"
    ],
    timeit_QuerYD=[
        f"{anno_root_it}/timeit_QuerYD.json",
        "pnorm2zxy:s3://queryd",
        "video"
    ],
    timeit_TVSum=[
        f"{anno_root_it}/timeit_TVSum.json",
        "pnorm2zxy:s3://tvsum",
        "video"
    ],
    timeit_ViTT=[
        f"{anno_root_it}/timeit_ViTT.json",
        "sssd:s3://ViTT",
        "video"
    ],
    timeit_yttemporal180m=[
        f"{anno_root_it}/timeit_yttemporal180m.json",
        "pnorm:s3://YT-Temporal-180M",
        "video"
    ],

    grounding_ANetRTL=[
        f"{anno_root_it}/grounding_ANetRTL.json",
        "pnorm2:s3://anet/ANet_320p_fps30/train",
        "video"
    ],
    # NOTE(review): "Intrenvid" looks like a typo for "Internvid"; the key and
    # annotation file name are kept as-is for compatibility with the data files.
    grounding_IntrenvidVTime_100K=[
        f"{anno_root_it}/grounding_IntrenvidVTime_100K.json",
        "pnorm:s3://youtubeBucket/videos/",
        "video"
    ],
    grounding_ANetHL2=[
        f"{anno_root_it}/grounding_ANetHL2.json",
        "pnorm2:s3://anet/ANet_320p_fps30/train",
        "video"
    ],
    grounding_CosmoCap_93K=[
        f"{anno_root_it}/grounding_CosmoCap_93K.json",
        "pvideo:s3://howto100m/",
        "video"
    ],
    vqa_moviechat=[
        f'{anno_root_it}/vqa_moviechat.json',
        'pnorm2:s3://MovieChat/real_video/',
        'video'
    ],
    caption_moviechat=[
        f'{anno_root_it}/caption_moviechat.json',
        'pnorm2:s3://MovieChat/real_video/',
        'video'
    ],
)


# Final training mixture: entries are shared (aliased) with the dicts above,
# not copied.
available_corpus["TimePro_Normal"] = [  # final dataset
    # TimeIT
    available_corpus["timeit_ANet"],
    available_corpus["timeit_COIN"],
    available_corpus["timeit_DiDeMo"],
    available_corpus["timeit_HiREST"],
    available_corpus["timeit_QuerYD"],
    available_corpus["timeit_TVSum"],
    available_corpus["timeit_ViTT"],
    available_corpus["timeit_yttemporal180m"],
    # Conversation
    available_corpus["conversation_videochatgpt"],
    available_corpus["conversation_videochat2"],
    available_corpus["conversation_videochat1"],
    # Dense video captioning / VQA
    available_corpus["caption_youcook2"],
    available_corpus["vqa_ego_qa"],
    # Grounding
    available_corpus["grounding_ANetRTL"],
    available_corpus["grounding_IntrenvidVTime_100K"],
    available_corpus["grounding_ANetHL2"],
    available_corpus["grounding_CosmoCap_93K"],
    available_corpus["vqa_moviechat"],
    available_corpus["caption_moviechat"],
    available_corpus["reasoning_star"],
]

configs/model.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
# Registry of text-encoder configurations, keyed by a short encoder name.
TextEncoders = dict()
TextEncoders["bert"] = dict(
    name="bert_base",
    pretrained="bert-base-uncased",
    config="configs/config_bert.json",
    d_model=768,      # matches "hidden_size" in configs/config_bert.json
    fusion_layer=9,   # matches "fusion_layer" in configs/config_bert.json
)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../../../../../.cache/huggingface/hub/datasets--ShuhuaiRen--TimeIT/blobs/36470518c0a555bbc7e7ae0b30393441ec533e03
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../../../../../.cache/huggingface/hub/datasets--ShuhuaiRen--TimeIT/blobs/2d6aad3236b910b1877aa8058dd0be19b3f333b7cefebd1f6c852880d13a6dc3

dataset/TimeIT/dense_video_captioning/anet/test.caption_coco_format.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

dataset/TimeIT/dense_video_captioning/anet/train.caption_coco_format.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

dataset/TimeIT/dense_video_captioning/anet/val.caption_coco_format.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
{
  "0": "Localize a series of activity events in the video, output the start and end timestamp for each event, and describe each event with sentences. The output format of each predicted event should be like: 'start - end seconds, event description'. A specific example is: ' 90 - 102 seconds, spread margarine on two slices of white bread in the video'.",
  "1": "Determine the start and end times of various activity events in the video, accompanied by descriptions.",
  "2": "Capture and describe the activity events in the given video, specifying their respective time intervals, and outputting the time intervals in the 'start - end seconds format'.",
  "3": "Identify, timestamp, and describe various activity events occurring in the video. The timestamp should include the start time and end time in seconds.",
  "4": "Detect and report the start and end timestamps of activity events in the video, along with descriptions.",
  "5": "Pinpoint the time intervals of activity events in the video, and provide detailed descriptions for each event."
}

dataset/TimeIT/dense_video_captioning/vitt/instruct_dvc_5.1k_vitt.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../../../../../.cache/huggingface/hub/datasets--ShuhuaiRen--TimeIT/blobs/1dc9787ee6fa38f8c3223b14eb10da0efdfa1c17ef9f0dea77fafd5425a5c5dc

dataset/TimeIT/dense_video_captioning/youcook2/instruct_dvc_1.2k_youcook2.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)