-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcardata.sh
executable file
·61 lines (47 loc) · 1.68 KB
/
cardata.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash
# author: SEB
# auto-data.net car data crawler. Run with cardata_starter.sh.
#
# Usage: cardata.sh <from> <to>
#   Downloads the car page for every car_id in [from, to), extracts the data
#   with ./xidel using template.html, and writes one XML file per car into
#   the cardata/ directory (created if missing).
#
# Errors are reported on stdout, one line each, in the form
#   error_<stage> <code> <url>
# Redirect stderr (preferably to /dev/null): xidel uses stderr for redundant
# messages, while this script uses stdout for its own error reports.

usage='Usage: <from> (inclusive) <to> (exclusive)'

if [ "$#" -ne 2 ]; then
  echo "$usage"; exit 1
fi

# Only plain decimal integers are accepted. Anything else would be evaluated
# as an arithmetic expression (or variable name) by the for (( )) loop below.
if [[ ! "$1" =~ ^[0-9]+$ || ! "$2" =~ ^[0-9]+$ ]]; then
  echo "$usage"; exit 1
fi

# Force base 10 so that zero-padded ids such as 0012 are not read as octal.
from=$((10#$1))
to=$((10#$2))

# Output directory; stop early if it cannot be created, since every later
# write would fail anyway.
mkdir -p cardata || exit 1

# One temp file, reused on every iteration, receives the wget header dump.
# The EXIT trap removes it on every exit path, including interruption.
header_file=$(mktemp) || { echo "error_mktemp $?"; exit 1; }
trap 'rm -f -- "$header_file"' EXIT

#for (( i=1; i<=19144; i++))
for (( i = from; i < to; i++ )); do
  url='http://www.auto-data.net/tr/?f=showCar&car_id='"$i"

  # Download the page body to stdout and write the server response headers
  # (-S) to the temp file via stderr. The redirection truncates the file, so
  # headers from the previous iteration never leak into this one.
  content=$(wget --user-agent="Mozilla/5.0 (Windows NT 5.2; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" -q -O - -S "$url" 2>"$header_file")
  # Check wget return value is 0.
  ret_val=$?
  if [ "$ret_val" -ne 0 ]; then
    echo "error_wget $ret_val $url"
    continue
  fi

  # Check the HTTP status code. A redirected request logs several status
  # lines; only the last one describes the response that produced $content.
  # Matching on a leading "HTTP/" avoids picking up other header lines that
  # merely contain the text "HTTP".
  http_status=$(awk '/^[[:space:]]*HTTP\// { code = $2 } END { print code }' "$header_file")
  # String comparison: an empty or malformed status is reported as an error
  # instead of breaking a numeric test and being silently ignored.
  if [ "$http_status" != "200" ]; then
    echo "error_http ${http_status:-unknown} $url"
    continue
  fi

  # Strip the boilerplate suffix from the page title and use the rest as the
  # file name; slashes are replaced by spaces so the title cannot introduce
  # extra path components.
  title=$(printf '%s\n' "$content" | ./xidel - -e '//title' | sed 's/ - Teknik özellikler, yakıt tüketimi//g; s/\// /g')
  filename="cardata/$title"'_'"$i.xml"

  # Parse & extract info with xidel. $? after the pipeline is xidel's status
  # because xidel is the last command in it.
  printf '%s\n' "$content" | ./xidel - --extract-file=template.html --output-format=xml-wrapped > "$filename"
  ret_val=$?
  if [ "$ret_val" -ne 0 ]; then
    rm -f -- "$filename" # delete the partial output file
    echo "error_xidel $ret_val $url"
    continue
  fi

  # Remove object tags, in-place sed.
  sed -i'' 's/<object>//g; s/<\/object>//g' "$filename"
done