1 | #!/usr/bin/perl -w |
---|
2 | |
---|
3 | # |
---|
# This Perl script is intended to perform movie data lookups in German, based on
# the www.ofdb.de website.
---|
6 | # |
---|
7 | # For more information on MythVideo's external movie lookup mechanism, see |
---|
8 | # the README file in this directory. |
---|
9 | # |
---|
10 | # Author: Xavier Hervy (maxpower44 AT tiscali DOT fr) |
---|
11 | # |
---|
12 | |
---|
13 | use LWP::Simple; # libwww-perl providing simple HTML get actions |
---|
14 | use HTML::Entities; |
---|
15 | use URI::Escape; |
---|
16 | #use utf8; |
---|
17 | |
---|
18 | use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P); |
---|
19 | use Getopt::Std; |
---|
20 | |
---|
# Program identification strings; version() prints them on a single line.
our ($title, $version, $author) = ("Ofdb Query", "v1.00", "Xavier Hervy");
---|
24 | |
---|
# Print the command-line synopsis on standard output, one option per line,
# and then terminate the script with exit status -1.  Never returns.
sub usage {
    my @synopsis = (
        "usage: $0 -hdrviMPD [parameters]\n",
        " -h help\n",
        " -d debug\n",
        " -r dump raw query result data only\n",
        " -v display version\n",
        " -i display info\n",
        "\n",
        " -M <query> get movie list\n",
        " -D <movieid> get movie data\n",
        " -P <movieid> get movie poster\n",
    );

    for my $line (@synopsis) {
        print $line;
    }

    exit(-1);
}
---|
39 | |
---|
# Print a single line naming the program: "<title> (<version>) by <author>".
# The three values come from the package variables $title, $version and
# $author.
sub version {
    printf("%s (%s) by %s\n", $title, $version, $author);
}
---|
44 | |
---|
# Print a single line describing the kind of query this script performs.
sub info {
    my $description = "Performs queries using the www.ofdb.de website.\n";
    print $description;
}
---|
49 | |
---|
# Detailed help: the version line, then the info line, then the usage text.
# usage() runs last because it ends the script via exit().
sub help {
    for my $step (\&version, \&info, \&usage) {
        $step->();
    }
}
---|
56 | |
---|
# Return the text of $data that lies between the first occurrence of $beg
# and the next occurrence of $end after it.
#
#   $data  text to search; may be undef (e.g. the result of a failed download)
#   $beg   marker that precedes the wanted text
#   $end   marker that follows the wanted text
#
# Both markers are matched case-insensitively, but the returned text keeps
# the case it has in $data.  The result is passed through removenbsp().
# Returns "" when $data is undefined or when either marker is not found.
sub parseBetween {
    my ($data, $beg, $end) = @_;    # grab parameters

    # LWP::Simple's get() yields undef on failure; treat that as "nothing
    # found" rather than triggering uninitialized-value warnings under -w.
    return "" unless defined $data;

    # Search a lower-cased copy so the markers match regardless of case.
    my $ldata = lc($data);

    my $begpos = index($ldata, lc($beg));
    return "" if $begpos == -1;

    my $start  = $begpos + length($beg);
    my $finish = index($ldata, lc($end), $start);
    return "" if $finish == -1;

    # decode_entities() is deliberately not used: according to the original
    # author's note it turned &nbsp; into special characters (possibly a bug
    # in HTML::Entities), so only &nbsp; is handled, via removenbsp().
    return removenbsp(substr($data, $start, $finish - $start));
}
---|
78 | |
---|
# Replace every "&nbsp;" entity in $data with a plain space and return the
# result.  The entity is matched case-insensitively.  Used instead of
# decode_entities().
#
#   $data  text to clean; an undefined value is returned unchanged
#
# The search string must be the full 6-character entity.  Searching for a
# single blank while skipping 6 characters would eat the text after every
# blank and, because a blank is re-inserted each time, never terminate.
sub removenbsp {
    my ($data) = @_;    # grab parameters

    return $data unless defined $data;

    $data =~ s/&nbsp;/ /gi;
    return $data;
}
---|
92 | |
---|
93 | |
---|
# Return $data with every "<...>" tag removed.
#
#   $data  text possibly containing markup; undef is treated as ""
#
# A tag runs from a "<" to the first ">" that follows it.  A "<" with no
# later ">" is left in place.  (An index()-based loop that adds 1 to the
# position of ">" cannot detect the "not found" case and would keep growing
# the string forever on such input.)
sub removeTag {
    my ($data) = @_;    # grab parameters

    return "" unless defined $data;

    $data =~ s/<[^>]*>//g;
    return $data;
}
---|
109 | |
---|
# get Movie Data
#
# Download the ofdb.de page of the film $movieid, scrape the individual
# fields out of the HTML and print them on standard output, one
# "Name:value" line per field.
#
#   $movieid  ofdb film id, appended to the "fid=" request parameter
#
# If the film page links to german.imdb.com, runtime, age rating and the
# first writer are taken from that page.  If the film page links to a full
# plot page ("[mehr]"), the plot is taken from there when it can be parsed.
# A download that fails is treated as an empty page, so the affected fields
# are simply printed empty.
#
# Honours the global switches $opt_d (debug lines prefixed with "#") and
# $opt_r (dump the raw film page).
sub getMovieData {
    my ($movieid) = @_;    # grab movieid parameter
    if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

    # get the film page
    my $request = "http://www.ofdb.de/view.php?page=film&fid=" . $movieid;
    if (defined $opt_d) { printf("# request: '%s'\n", $request); }
    my $response = get($request);
    $response = "" unless defined $response;    # get() returns undef on failure
    if (defined $opt_r) { printf("%s", $response); }

    # parse title and year
    my $title = parseBetween($response,
        "<font face=\"Arial,Helvetica,sans-serif\" size=\"3\"><b>",
        "</b></font></td>");
    my $year = parseBetween($response,
        "<a href=\"view.php?page=blaettern&Kat=Jahr&Text=", "\">");

    # parse director: narrow down to the "Regie:" table row first, then take
    # the text of the first link in it
    my $director = parseBetween($response, "Regie:", "</tr>");
    $director = parseBetween($director, "\">", "</a>");
    $director = removeTag($director);

    # parse user rating
    # NOTE(review): the end marker is a single blank as found in this file;
    # it may originally have been an "&nbsp;" entity - confirm against the
    # page markup.
    my $userrating = parseBetween($response, "Note: ", " ");

    # parse cast: one link per actor, separated by <br>; turn the separators
    # into commas before the tags are stripped
    my $cast = parseBetween($response, "Darsteller:", "</b>");
    $cast =~ s/<br><a/,<a/g;
    # remove linebreaks and empty space:
    $cast =~ s/\n//g;
    1 while ($cast =~ s/\s\s//g);
    $cast = removeTag($cast);

    # genres (lexical: must not leak into the package namespace)
    my $genres = parseBetween($response, "Genre(s):", "></font></td>");
    $genres = parseBetween($genres, "class=\"Daten\"><b>", "</b");
    $genres =~ s/<br><a/,<a/g;
    $genres = removeTag($genres);

    # countries
    my $countries = parseBetween($response, "Herstellungsland", "</tr>");
    $countries = parseBetween($countries, "Daten\"><b>", "</td>");
    $countries =~ s/<br><a/,<a/g;
    $countries = removeTag($countries);

    # parse the short plot and the link to the full plot page
    my $plot = parseBetween($response, "Inhalt:", "[mehr]");
    $plot = removeTag($plot);
    my $ploturl = parseBetween($response, "view.php?page=inhalt", "\"><b>[mehr]");

    # runtime, rating and writer are provided by german.imdb.com
    my $runtime     = 0;
    my $movierating = "";
    my $writer      = "";
    my $urlimdb = parseBetween($response, "http://german.imdb.com/Title?", "\" target");
    if ($urlimdb ne "") {
        my $imdbpage = get("http://german.imdb.com/Title?" . $urlimdb);
        $imdbpage = "" unless defined $imdbpage;

        # parse movie length
        $runtime = parseBetween($imdbpage, "Länge:</b>\n", " min ");

        # parse movie rating
        $movierating = parseBetween($imdbpage, "Altersfreigabe:</b>\n", " \n<br>");
        $movierating = removeTag($movierating);

        # parse writer (only the first)
        $writer = parseBetween($imdbpage,
            "<b class=\"blackcatheader\">Buch</b>\n\n<br>\n", " <br>");
        $writer = parseBetween($writer, ">", "</a>");
    }

    # prefer the full plot text when the film page links to it
    if ($ploturl ne "") {
        my $plotpage = get("http://www.ofdb.de/view.php?page=inhalt" . $ploturl);
        $plotpage = "" unless defined $plotpage;
        my $fullplot = parseBetween($plotpage, "</a></b><br><br>", "</font></p>");
        if ($fullplot ne "") {
            $plot = $fullplot;
        }
    }

    # output fields (these field names must match what MythVideo is looking for)
    print "Title:$title\n";
    print "Year:$year\n";
    print "Director:$director\n";
    print "Plot:$plot\n";
    print "UserRating:$userrating\n";
    print "MovieRating:$movierating\n";
    print "Runtime:$runtime\n";
    print "Writers: $writer\n";
    print "Cast: $cast\n";
    print "Genres:$genres\n";
    print "Countries:$countries\n";
}
---|
213 | |
---|
# dump Movie Poster
#
# Print the URL of a poster for the ofdb film $movieid on standard output,
# followed by a newline.  An empty line is printed when no poster is found.
#
#   $movieid  ofdb film id, appended to the "fid=" request parameter
#
# Sources are tried in this order:
#   1. an impawards.com poster linked from the film's IMDb posters page,
#   2. a posters.imdb.com image on that same page,
#   3. the cover image embedded in the ofdb film page itself.
# Steps 1 and 2 are only possible when the ofdb page links to
# german.imdb.com.  A download that fails is treated as an empty page.
#
# Honours the global switches $opt_d (debug lines prefixed with "#") and
# $opt_r (dump every downloaded page).
sub getMoviePoster {
    my ($movieid) = @_;    # grab movieid parameter
    if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid); }

    # get the film page
    my $request = "http://www.ofdb.de/view.php?page=film&fid=" . $movieid;
    if (defined $opt_d) { printf("# request: '%s'\n", $request); }
    my $response = get($request);
    $response = "" unless defined $response;    # get() returns undef on failure
    if (defined $opt_r) { printf("%s", $response); }

    # ofdb's own cover image, used as the last resort
    my $uriofdb = parseBetween($response, "Linke", "Aufgerufen");
    $uriofdb = parseBetween($uriofdb, "src=\"", "\" alt");
    if ($uriofdb eq "" || $uriofdb eq "images/film/na.gif") {
        # nothing parsed, or (presumably) ofdb's "no image" placeholder:
        # report no poster instead of a bare site URL
        $uriofdb = "";
    }
    else {
        # no trailing newline here; the final print adds exactly one
        $uriofdb = "http://www.ofdb.de/$uriofdb";
    }

    my $uri     = "";
    my $urlimdb = parseBetween($response, "http://german.imdb.com/Title?", "\" target");
    if ($urlimdb ne "") {
        # get the IMDb posters page
        my $postersurl = "http://www.imdb.com/title/tt" . $urlimdb . "/posters";
        if (defined $opt_d) { printf("# request: '%s'\n", $postersurl); }
        my $posterspage = get($postersurl);
        $posterspage = "" unless defined $posterspage;
        if (defined $opt_r) { printf("%s", $posterspage); }

        # look for references to impawards.com posters - they are high quality
        my $site    = "http://www.impawards.com";
        my $impsite = parseBetween($posterspage, "<a href=\"" . $site, "\">" . $site);
        if ($impsite) {
            $impsite = $site . $impsite;
            if (defined $opt_d) { print "# Searching for poster at: " . $impsite . "\n"; }
            my $impres = get($impsite);
            $impres = "" unless defined $impres;
            if (defined $opt_d) { printf("# got %i bytes\n", length($impres)); }
            if (defined $opt_r) { printf("%s", $impres); }

            $uri = parseBetween($impres, "<img SRC=\"posters/", "\" ALT");

            # Only build a URL when an image name was actually found.  An
            # empty name would otherwise become ".../posters/" and stop the
            # imdb fallback below from running.
            if ($uri ne "") {
                # uri here is relative... patch it up to make a valid uri
                if ($uri !~ /http:(.*)/) {
                    my $path = substr($impsite, 0, rindex($impsite, '/') + 1);
                    $uri = $path . "posters/" . $uri;
                }
                if (defined $opt_d) { print "# found ipmawards poster: $uri\n"; }
            }
        }

        # if the impawards site attempt didn't give a filename grab it from imdb
        if ($uri eq "") {
            if (defined $opt_d) { print "# looking for imdb posters\n"; }
            my $host = "http://posters.imdb.com/posters/";

            $uri = parseBetween($posterspage, $host, "\"><td><td><a href=\"");
            if ($uri ne "") {
                $uri = $host . $uri;
            }
            elsif (defined $opt_d) {
                print "# no poster found\n";
            }
        }
    }

    if ($uri eq "") {
        print "$uriofdb\n";
    }
    else {
        print "$uri\n";
    }
}
---|
282 | |
---|
# dump Movie list: 1 entry per line, each line as 'movieid:Movie Title'
#
# Turn $filename into a search term, query the ofdb.de title search and
# print one line per hit on standard output.  The year is appended to the
# title in parentheses when one is found in the hit.
#
#   $filename  file name or title to search for (may be URI-escaped)
#   $options   optional; only echoed in the debug output
#
# With $opt_r set the raw search page is printed and the script exits with
# status 0.  $opt_d enables debug lines prefixed with "#".  A download that
# fails is treated as an empty page, so nothing is printed.
sub getMovieList {
    my ($filename, $options) = @_;    # grab parameters

    # If we wanted to inspect the file for any reason we can do that now

    #
    # Convert filename into a query string
    # (the original note says these are the same rules Metadata::guesTitle uses)
    my $query = uri_unescape($filename);    # in case it was escaped

    # Strip off the file extension, then anything following the last '('
    # and the last '[' - people use these for general comments
    for my $marker ('.', '(', '[') {
        my $cut = rindex($query, $marker);
        if ($cut != -1) {
            $query = substr($query, 0, $cut);
        }
    }

    # IMDB searches do better if any trailing ,The is left off.
    # $1 is only consulted after a successful match, never a stale one.
    if ($query =~ /(.*), The$/i && $1) {
        $query = $1;
    }

    # prepare the url
    $query = uri_escape($query);
    if (!$options) { $options = ""; }
    if (defined $opt_d) {
        printf("# query: '%s', options: '%s'\n", $query, $options);
    }

    # get the search results page
    my $request = "http://www.ofdb.de/view.php?page=suchergebnis&Kat=DTitel&SText=$query";
    if (defined $opt_d) { printf("# request: '%s'\n", $request); }
    my $response = get($request);
    $response = "" unless defined $response;    # get() returns undef on failure
    if (defined $opt_r) {
        print $response;
        exit(0);
    }

    # extract the numbered result list from the page
    my $exact_matches = parseBetween($response, "</b><br><br>1.",
                                     "<br><br><br></font></p>");

    # parse movie list from matches: each hit is the text between the start
    # of a film link and the next closing </a>
    my $beg = "<a href='view.php?page=film&";
    my $end = "</a>";

    my @movies;

    # A link with no closing </a> simply does not match, so the scan always
    # terminates.
    while ($exact_matches =~ /\Q$beg\E(.*?)\Q$end\E/gs) {
        my $sub = $1;

        my $movienum = parseBetween($sub, "fid=", "'>");
        my $title    = parseBetween($sub, ">", "<font size='1'>");
        $title = removeTag($title);

        # the year, when present, appears in parentheses in the link text
        my $linktext = removeTag($sub);
        my ($movieyear) = $linktext =~ /\((\d+)\)/;
        if ($movieyear) { $title = $title . " (" . $movieyear . ")"; }

        # add to array
        push @movies, $movienum . ":" . $title;
    }

    # display array of values
    for my $movie (@movies) { print "$movie\n"; }
}
---|
378 | |
---|
379 | # |
---|
380 | # Main Program |
---|
381 | # |
---|
382 | |
---|
# parse command line arguments
# All switches are declared without a trailing ':', i.e. as plain flags, so
# the query / movie id that follows -M, -D or -P stays in @ARGV and is
# picked up with shift below.
getopts('ohrdivDMP');

# print out info: -v and -i each print one line and stop right away
if (defined $opt_v) {
    version();
    exit 1;
}
if (defined $opt_i) {
    info();
    exit 1;
}

# print out usage if needed: on -h, or when nothing is left on the command
# line (help() does not return, it ends in usage(), which exits)
help() if defined($opt_h) || !@ARGV;

if (defined $opt_D) {
    # take movieid from cmdline arg
    $movieid = shift(@ARGV) || die "Usage : $0 -D <movieid>\n";
    getMovieData($movieid);
}
elsif (defined $opt_P) {
    # take movieid from cmdline arg
    $movieid = shift(@ARGV) || die "Usage : $0 -P <movieid>\n";
    getMoviePoster($movieid);
}
elsif (defined $opt_M) {
    # take query from cmdline arg; with a single argument it is the query,
    # with two the first one is the options string
    $options = shift(@ARGV) || die "Usage : $0 -M [options] <query>\n";
    $query   = shift(@ARGV);
    ($query, $options) = ($options, "") unless $query;
    getMovieList($query, $options);
}
---|
415 | |
---|