-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathpreprocess-data.sh
51 lines (38 loc) · 1.99 KB
/
preprocess-data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Enter the dataset directory. Abort if that fails: everything below (starting
# with the rm -f cleanup) uses relative paths and would otherwise operate on
# whatever directory the script happened to be started from.
cd YouTube2Text/youtubeclips-dataset || {
  echo "${0##*/}: cannot cd to YouTube2Text/youtubeclips-dataset" >&2
  exit 1
}
# Clean up files if they already exist
# NOTE(review): the vocabulary step further down writes vocabulary.txt, not
# vocabulary_clean.txt -- confirm which file name is intended here.
rm -f -- clean_descriptions.csv matched_descriptions.csv vocabulary_clean.txt vidnames.txt
# Filter out clean descriptions if required
# Each emitted row is "<field1>_<field2>_<field3>,<field8>" of video-descriptions.csv.
# When the first script argument is exactly "CleanOnly", only rows containing
# ",clean," are kept; for any other (or no) argument every row is used.
reformat_prog='{print $1"_"$2"_"$3","$8}'
case "$1" in
  CleanOnly)
    grep -E ",clean," video-descriptions.csv | awk -F, "$reformat_prog" > clean_descriptions.csv
    ;;
  *)
    awk -F, "$reformat_prog" video-descriptions.csv > clean_descriptions.csv
    ;;
esac
# Match descriptions to videos
#
# Writes youtube_mapping.csv (youtube_mapping.txt with every space replaced by
# a comma), joins it with clean_descriptions.csv on the first comma-separated
# field, and stores fields 2 and 3 of each joined row in
# matched_descriptions.csv.
match_descriptions() {
  sed 's/ /,/g' youtube_mapping.txt > youtube_mapping.csv
  # join requires both inputs to be ordered by the join field alone. Sorting
  # whole lines lets the text after the first comma influence the order, which
  # can leave keys out of sequence and make join drop pairings, so sort
  # explicitly on field 1 with the same separator join uses.
  join -t ',' \
    <(sort -t ',' -k 1,1 youtube_mapping.csv) \
    <(sort -t ',' -k 1,1 clean_descriptions.csv) \
    | awk -F, '{print $2","$3}' > matched_descriptions.csv
}
match_descriptions
# Clean descriptions
rm -f -- matched_descriptions_symbolfree.csv bad_descriptions.csv cleaned_descriptions.csv
## Remove Symbols
#
# Reads matched_descriptions.csv and produces three files:
#   matched_descriptions_symbolfree.csv  every line after symbol replacement
#   bad_descriptions.csv                 lines that still hold a disallowed character
#   symbolfree_descriptions.csv          the remaining lines
remove_symbols() {
  # A line is rejected when it contains any character outside this set.
  local disallowed="[^0-9A-Za-z,\. '&-]"

  # Strip one trailing "." and then one trailing "!", turn double quotes and
  # the characters [ ] / ( ) into spaces, and backticks into apostrophes.
  # NOTE(review): the final tr maps a space to a space and so does nothing as
  # written; the first argument was probably a tab or another whitespace
  # character lost in transit -- confirm against the upstream script.
  sed -e 's/\.$//' -e 's/\!$//' -e 's/"/ /g' matched_descriptions.csv \
    | tr '`' "'" \
    | tr "[" " " | tr "]" " " | tr "/" " " | tr "(" " " | tr ")" " " \
    | tr " " " " \
    > matched_descriptions_symbolfree.csv

  grep "$disallowed" matched_descriptions_symbolfree.csv > bad_descriptions.csv
  # Select the good lines with the inverse of the same test. Feeding
  # bad_descriptions.csv back through "grep -f" would interpret each rejected
  # line as a regular expression: lines containing *, ., \ and similar may not
  # match themselves, and may match (and delete) a different, valid line.
  grep -v "$disallowed" matched_descriptions_symbolfree.csv > symbolfree_descriptions.csv
}
remove_symbols
## Remove Short Sentences
# Keep only lines with at least five whitespace-separated fields, then drop
# the space that follows a comma.
awk 'NF>=5' symbolfree_descriptions.csv | sed "s/, /,/g" > cleaned_descriptions.csv
# Report the difference between the line counts of the raw input and the
# cleaned output, followed by the size of the cleaned output. The second
# message carries the full "wc -l" output, i.e. the count and the file name.
total_lines=$(wc -l < video-descriptions.csv)
kept_lines=$(wc -l < cleaned_descriptions.csv)
echo $(( $total_lines - $kept_lines )) "Captions deleted"
echo $(wc -l cleaned_descriptions.csv) "Captions to be used"
# Create Vocabulary file
#
# Reads cleaned_descriptions.csv and writes vocabulary.txt with one
# "<count>,<word>" line per distinct word (compared case-insensitively),
# ordered by a reverse version sort of "<count> <word>".
build_vocabulary() {
  # Steps: strip <...> tags, take the second comma-separated field, split it
  # into one word per line, drop empty lines, count, trim uniq's left padding.
  #
  # sort -f is required for the counting step: uniq -i only merges lines that
  # are adjacent, and a plain sort does not place case variants next to each
  # other in every locale (in the C locale all upper-case words sort first).
  sed -e 's/<[^>]*>//g' cleaned_descriptions.csv \
    | awk -F',' '{print $2}' \
    | tr " " "\n" \
    | sed '/^$/d' \
    | sort -f \
    | uniq -ci \
    | sed 's/^ *//g' \
    | sort -Vr \
    | sed 's/ /,/g' > vocabulary.txt
}
build_vocabulary
# Make file with list of videonames
#
# Writes the names in the current directory that match *avi to vidnames.txt,
# one per line, in version-sort order.
list_video_names() {
  local clip
  # Expand the glob directly instead of parsing ls output, so names are passed
  # through unchanged and a name starting with "-" is never taken as an option.
  for clip in *avi; do
    # With no match the pattern is left as the literal string "*avi"; skip it
    # so vidnames.txt ends up empty in that case.
    [ -e "$clip" ] || continue
    printf '%s\n' "$clip"
  done | sort -V > vidnames.txt
}
list_video_names
# Run Python script to generate script to extract video frames
# The generator's standard output becomes extract_frames.sh two levels up.
python ../../extract_frame_gen.py > ../../extract_frames.sh
# Go back up two levels; stop if that fails so the directories below are not
# created in the wrong place.
cd ../.. || {
  echo "${0##*/}: cannot cd to ../.." >&2
  exit 1
}
# Create the output directories. -p on all of them keeps a second run of the
# script from failing on directories that already exist.
mkdir -p logs models language_model/results language_model/annotations