Ticket #3679: imdb.pl

File imdb.pl, 19.0 KB (added by anonymous, 17 years ago)

imdb.pl fix

Line 
#!/usr/bin/perl -w

#
# This perl script is intended to perform movie data lookups based on
# the popular www.imdb.com website
#
# For more information on MythVideo's external movie lookup mechanism, see
# the README file in this directory.
#
# Author: Tim Harvey (tharvey AT alumni.calpoly DOT edu)
# Modified: Andrei Rjeousski
# v1.1
# - Added amazon.com covers and improved handling for imdb posters
# v1.2
#     - when searching amazon, try searching for main movie name and if nothing
#       is found, search for informal name
#     - better handling for amazon posters, see if movie title is a substring
#       in the search results returned by amazon
#     - fixed redirects for some movies on impawards
# v1.3
#     - fixed search for low res images (imdb changed the page layout)
#     - added cinemablend poster search
#     - added nexbase poster search
#     - removed amazon.com searching for now

# changes:
# 9-10-2006: Anduin Withers
#   Changed output to utf8
# 30-6-2007 fix for new imdb.com design

use LWP::Simple;      # libwww-perl providing simple HTML get actions
use HTML::Entities;
use URI::Escape;


# option flags filled in by Getopt::Std::getopts()
use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P);
use Getopt::Std;

# identification strings printed by version()
$title = "IMDB Query";
$version = "v1.3";
$author = "Tim Harvey, Andrei Rjeousski";

# all results are written to STDOUT as utf8
binmode(STDOUT, ":utf8");

# display usage
#
# Prints the command-line synopsis to STDOUT and then terminates the
# script with exit(-1); control never returns to the caller.
sub usage {
   print "usage: $0 -hdrviMPD [parameters]\n";
   print "       -h           help\n";
   print "       -d           debug\n";
   print "       -r           dump raw query result data only\n";
   print "       -v           display version\n";
   print "       -i           display info\n";
   print "\n";
   print "       -M [options] <query>    get movie list\n";
   print "               some known options are:\n";
   print "                  type=[fuzy]         looser search\n";
   print "                  from_year=[int]     limit matches to year\n";
   print "                  to_year=[int]       limit matches to year\n";
   print "                  sort=[smart]        ??\n";
   print "                  tv=[no|both|only]   limits between tv and movies\n";
   print "               Note: multiple options must be separated by ';'\n";
   print "       -P <movieid>  get movie poster\n";
   print "       -D <movieid>  get movie data\n";
   exit(-1);
}

# display 1-line of info that describes the version of the program
#
# Reads the file-level $title, $version and $author globals.
sub version {
   print "$title ($version) by $author\n";
}

# display 1-line of info that can describe the type of query used
sub info {
   print "Performs queries using the www.imdb.com website.\n";
}

# display detailed help
#
# Prints the version line, the info line and the usage text, in that
# order.  usage() ends the script via exit(-1), so help() does not return.
sub help {
   version();
   info();
   usage();
}

# Returns a copy of the argument with leading and trailing whitespace
# removed.  An undefined argument is handed back untouched (still undef)
# so that no "uninitialized value" warnings are raised under -w.
sub trim {
   my ($str) = @_;
   return $str unless defined $str;
   $str =~ s/^\s+//;
   $str =~ s/\s+$//;
   return $str;
}

# returns text within 'data' between 'beg' and 'end' matching strings
#
# Parameters:
#   data - text to search (may be undef, e.g. after a failed download)
#   beg  - marker that precedes the wanted text
#   end  - marker that follows the wanted text
#
# Both markers are located case-insensitively; the search for 'end' starts
# right after 'beg'.  Returns the text between the two markers with HTML
# entities decoded, or "" when 'data' is undef or either marker is missing.
sub parseBetween {
   my ($data, $beg, $end)=@_; # grab parameters

   # nothing was downloaded: behave like "no match" instead of warning
   return "" unless defined $data;

   # search a lower-cased copy, but cut the result from the original text
   my $ldata = lc($data);

   my $beg_pos = index($ldata, lc($beg));
   return "" if $beg_pos == -1;

   my $start = $beg_pos + length($beg);
   my $finish = index($ldata, lc($end), $start);
   return "" if $finish == -1;

   my $result = substr($data, $start, $finish - $start);
   # return w/ decoded numeric character references
   # (see http://www.w3.org/TR/html4/charset.html#h-5.3.1)
   decode_entities($result);
   return $result;
}

# get Movie Data
#
# Downloads http://www.imdb.com/title/tt<movieid>/ and prints one
# "Field:value" line per item (title, year, director, plot, ratings,
# runtime, writers, cast, genres, countries) to STDOUT.
#
# Parameters:
#   movieid - numeric part of the IMDB title id (without the "tt" prefix)
#
# With -d, progress lines starting with '#' are printed; with -r the raw
# page is printed before the fields.  When the download fails every field
# is printed with an empty value.
sub getMovieData {
   my ($movieid)=@_; # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}

   # matches a link to a person page and captures the person's name
   my $name_link_pat = qr'<a href="/name/[^"]*">([^<]*)</a>'m;

   # get the search results  page
   my $request = "http://www.imdb.com/title/tt" . $movieid . "/";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get $request;
   if (!defined $response) {
      # get() yields undef on failure; parse an empty page instead so the
      # field list below is still emitted without "uninitialized" warnings
      if (defined $opt_d) { print "# request failed, no data received\n"; }
      $response = "";
   }
   if (defined $opt_r) { printf("%s", $response); }

   # parse title and year
   my $year = "";
   my $title = parseBetween($response, "<title>", "</title>");
   if ($title =~ m#(.+) \((\d+).*\)#) # Note some years have a /II after them?
   {
      $title = $1;
      $year = $2;
   }
   elsif ($title =~ m#(.+) \(\?\?\?\?\)#)
   {
      $title = $1;
   }

   # parse director
   my $data = parseBetween($response, ">Director:</h5>", "</div>");
   if (!length($data)) {
      $data = parseBetween($response, ">Directors:</h5>", "</div>");
   }
   my $director = join(",", ($data =~ m/$name_link_pat/g));

   # parse writer
   # (all names linked in the first writer section that is found are kept)
   $data = parseBetween($response, ">Writers <a href=\"/wga\">(WGA)</a>:</h5>", "</div>");
   if (!length($data)) {
         $data = parseBetween($response, ">Writer:</h5>", "</div>");
   }
   if (!length($data)) {
         $data = parseBetween($response, ">Writers:</h5>", "</div>");
   }
   my $writer = join(",", ($data =~ m/$name_link_pat/g));

   # parse plot
   my $plot = parseBetween($response, ">Plot Outline:</h5> ", "</div>");
   if (!$plot) {
      $plot = parseBetween($response, ">Plot Summary:</h5> ", "</div>");
   }

   if ($plot) {
      # replace name links in plot (example 0388795)
      $plot =~ s/$name_link_pat/$1/g;

      # replace title links
      my $title_link_pat = qr!<a href="/title/[^"]*">([^<]*)</a>!m;
      $plot =~ s/$title_link_pat/$1/g;

      # plot ends at first remaining link
      my $plot_end = index($plot, "<a ");
      if ($plot_end != -1) {
         $plot = substr($plot, 0, $plot_end);
      }
      $plot = trim($plot);
   }

   # parse user rating
   my $userrating = parseBetween($response, ">User Rating:</b>", "> (");
   $userrating = parseBetween($userrating, "<b>", "/");

   # parse MPAA rating
   my $ratingcountry = "USA";
   my $movierating = trim(parseBetween($response, ">MPAA</a>:</h5>", "</div>"));
   if (!$movierating) {
       # no MPAA block: fall back to the certificate listed for the country
       $movierating = parseBetween($response, ">Certification:</h5>", "</div>");
       $movierating = parseBetween($movierating, "certificates=$ratingcountry",
                                   "/a>");
       $movierating = parseBetween($movierating, ">", "<");
   }

   # parse movie length
   my $runtime = trim(parseBetween($response, ">Runtime:</h5>", " min"));
   unless ($runtime =~ /^-?\d/) {
      # runtime was not a plain number; retry with the "USA:" entry
      $runtime = trim(parseBetween($response, "USA:", " min"));
   }

   # parse cast
   #  Note: full cast would be from url:
   #    www.imdb.com/title/<movieid>/fullcredits
   my $cast = "";
   $data = parseBetween($response, "Cast overview, first billed only",
                               "/table>");
   if ($data) {
      $cast = join(',', ($data =~ m/$name_link_pat/g));
   }


   # parse genres
   my $lgenres = "";
   $data = parseBetween($response, "<h5>Genre:</h5>","</div>");
   if ($data) {
      my $genre_pat = qr'/Sections/Genres/(?:[a-z ]+/)*">([^<]+)<'im;
      $lgenres = join(',', ($data =~ /$genre_pat/g));
   }

   # parse countries
   $data = parseBetween($response, "Country:</h5>","</div>");
   my $country_pat = qr'/Sections/Countries/[A-Z]+/">([^<]+)</a>'i;
   my $lcountries = join(",", ($data =~ m/$country_pat/g));

   # output fields (these field names must match what MythVideo is looking for)
   print "Title:$title\n";
   print "Year:$year\n";
   print "Director:$director\n";
   print "Plot:$plot\n";
   print "UserRating:$userrating\n";
   print "MovieRating:$movierating\n";
   print "Runtime:$runtime\n";
   print "Writers: $writer\n";
   print "Cast: $cast\n";
   print "Genres: $lgenres\n";
   print "Countries: $lcountries\n";
}

# dump Movie Poster
#
# Looks up a poster image for an IMDB title and prints its URL followed by
# a newline (an empty line when nothing was found).  Sources are tried in
# this order, stopping at the first hit:
#   1. impawards.com page linked from the IMDB posters page
#   2. nexbase page linked from the IMDB posters page
#   3. cinemablend page linked from the IMDB posters page
#   4. posters.imdb.com image referenced on the IMDB posters page
#   5. low resolution image on the IMDB title page
#
# Parameters:
#   movieid - numeric part of the IMDB title id (without the "tt" prefix)
#
# Returns without printing anything when the IMDB posters page itself
# cannot be downloaded.  -d adds '#' progress lines, -r dumps raw pages.
sub getMoviePoster {
   my ($movieid)=@_; # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}

   # get the search results  page
   my $request = "http://www.imdb.com/title/tt" . $movieid . "/posters";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get $request;

   # check for a failed download before touching the page text
   if (!defined $response) {return;}
   if (defined $opt_r) { printf("%s", $response); }

   my $uri = "";

   # look for references to impawards.com posters - they are high quality
   my $site = "http://www.impawards.com";
   my $impsite = parseBetween($response, "<a href=\"".$site, "\">".$site);

   # jersey girl fix
   $impsite = parseBetween($response, "<a href=\"http://impawards.com","\">http://impawards.com") if ($impsite eq "");

   if ($impsite) {
      $impsite = $site . $impsite;

      if (defined $opt_d) { print "# Searching for poster at: ".$impsite."\n"; }
      my $impres = get $impsite;
      $impres = "" unless defined $impres;   # failed download == empty page
      if (defined $opt_d) { printf("# got %i bytes\n", length($impres)); }
      if (defined $opt_r) { printf("%s", $impres); }

      # making sure it isnt redirect
      my $redirect = parseBetween($impres, "0;URL=..", "\">");
      if ($redirect ne "") {
         if (defined $opt_d) { printf("# processing redirect to %s\n",$redirect); }
         # this was redirect
         $impsite = $site . $redirect;
         $impres = get $impsite;
         $impres = "" unless defined $impres;
      }

      # do stuff normally
      my $poster = parseBetween($impres, "<img SRC=\"posters/", "\" ALT");
      if ($poster ne "") {
         # uri here is relative... patch it up to make a valid uri
         if ($poster !~ /http:/) {
            my $path = substr($impsite, 0, rindex($impsite, '/') + 1);
            $poster = $path."posters/".$poster;
         }
         $uri = $poster;
         if (defined $opt_d) { print "# found ipmawards poster: $uri\n"; }
      }
      # when no image was found $uri stays empty so the sources below are
      # still consulted (building "<path>posters/" here would block them)
   }

   # try looking on nexbase
   if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)nexbase/i) {
      my $page = $1;   # copy before anything else can reset the captures
      if ($page ne "") {
         if (defined $opt_d) { print "# found nexbase poster page: $page \n"; }
         my $cinres = get $page;
         $cinres = "" unless defined $cinres;
         if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
         if (defined $opt_r) { printf("%s", $cinres); }

         if ($cinres =~ m/<a id="photo_url" href="([^"]*?)" ><\/a>/i) {
            if (defined $opt_d) { print "# nexbase url retreived\n"; }
            $uri = $1;
         }
      }
   }

   # try looking on cinemablend
   if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)cinemablend/i) {
      my $page = $1;   # copy before anything else can reset the captures
      if ($page ne "") {
         if (defined $opt_d) { print "# found cinemablend poster page: $page \n"; }
         my $cinres = get $page;
         $cinres = "" unless defined $cinres;
         if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
         if (defined $opt_r) { printf("%s", $cinres); }

         if ($cinres =~ m/<td align=center><img src="([^"]*?)" border=1><\/td>/i) {
            if (defined $opt_d) { print "# cinemablend url retreived\n"; }
            $uri = "http://www.cinemablend.com/".$1;
         }
      }
   }

   # if the impawards site attempt didn't give a filename grab it from imdb
   if ($uri eq "") {
       if (defined $opt_d) { print "# looking for imdb posters\n"; }
       my $host = "http://posters.imdb.com/posters/";

       $uri = parseBetween($response, $host, "\"><td><td><a href=\"");
       if ($uri ne "") {
           $uri = $host.$uri;
       } else {
          if (defined $opt_d) { print "# no poster found\n"; }
       }
   }

   # no poster found, take lowres image from imdb
   if ($uri eq "") {
      if (defined $opt_d) { print "# looking for lowres imdb posters\n"; }
      my $host = "http://www.imdb.com/title/tt" . $movieid . "/";
      $response = get $host;
      $response = "" unless defined $response;

      # Better handling for low resolution posters
      #
      # (no /g here: it would leave pos() set on $response and make the
      #  title scan below start in the middle of the page)
      if ($response =~ m/<a name="poster".*<img.*src="([^"]*).*<\/a>/i) {
         if (defined $opt_d) { print "# found low res poster at: $1\n"; }
         $uri = $1;
      } else {
         if (defined $opt_d) { print "# no low res poster found\n"; }
         $uri = "";
      }

      # The titles are only reported in debug output; nothing else uses them.
      if (defined $opt_d) {
         print "# starting to look for movie title\n";

         # get main title
         print "# Getting possible movie titles:\n";
         my @movie_titles;
         push @movie_titles, parseBetween($response, "<title>", "<\/title>");
         print "# Title: ".$movie_titles[-1]."\n";

         # now we get all other possible movie titles and store them in the titles array
         while($response =~ m/>([^>^\(]*)([ ]{0,1}\([^\)]*\)[^\(^\)]*[ ]{0,1}){0,1}\(informal title\)/g) {
            push @movie_titles, trim($1);
            print "# Title: ".$movie_titles[-1]."\n";
         }
      }
   }

   print "$uri\n";
}

# dump Movie list:  1 entry per line, each line as 'movieid:Movie Title'
#
# Turns a file name into an IMDB search, downloads the result page and
# prints the matching titles to STDOUT.
#
# Parameters:
#   filename - file name (or plain title) to derive the query from
#   options  - optional ';'-separated IMDB search options; tv=, from_year=,
#              to_year= and video=no are additionally enforced locally
#
# Behaviour visible to the caller:
#   - with -r the raw result page is printed and the script exits (0)
#   - when IMDB answers with a movie page instead of a result list, that
#     single movie is printed and the script exits (0)
#   - when nothing matches, nothing is printed and the sub returns
sub getMovieList {
   my ($filename, $options)=@_; # grab parameters

   # If we wanted to inspect the file for any reason we can do that now

   #
   # Convert filename into a query string
   # (use same rules that Metadata::guesTitle does)
   my $query = $filename;
   $query = uri_unescape($query);  # in case it was escaped
   # Strip off the file extension
   if (rindex($query, '.') != -1) {
      $query = substr($query, 0, rindex($query, '.'));
   }
   # Strip off anything following '(' - people use this for general comments
   if (rindex($query, '(') != -1) {
      $query = substr($query, 0, rindex($query, '('));
   }
   # Strip off anything following '[' - people use this for general comments
   if (rindex($query, '[') != -1) {
      $query = substr($query, 0, rindex($query, '['));
   }

   # IMDB searches do better if any trailing ,The is left off
   # ($1 is only meaningful when the match succeeded, so test the match)
   if ($query =~ /(.*), The$/i && $1 ne "") { $query = $1; }

   # prepare the url
   $query = uri_escape($query);
   if (!$options) { $options = "" ;}
   if (defined $opt_d) {
      printf("# query: '%s', options: '%s'\n", $query, $options);
   }

   # get the search results  page
   #    some known IMDB options are:
   #         type=[fuzy]         looser search
   #         from_year=[int]     limit matches to year (broken at imdb)
   #         to_year=[int]       limit matches to year (broken at imdb)
   #         sort=[smart]        ??
   #         tv=[no|both|only]   limits between tv and movies (broken at imdb)
   #$options = "tt=on;nm=on;mx=20";  # not exactly clear what these options do
   my $request = "http://www.imdb.com/find?q=$query;$options";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get $request;
   if (defined $opt_r) {
      print $response if defined $response;
      exit(0);
   }
   if (!defined $response) {
      # download failed: same outcome as an empty result page
      if (defined $opt_d) { printf("# no results\n"); }
      return;
   }

   # check to see if we got a results page or a movie page
   #    looking for 'add=<movieid>" target=' which only exists
   #    in a movie description page
   my $movienum = parseBetween($response, "add=", "\">");
   if ($movienum) {
       if (defined $opt_d) { printf("# redirected to movie page\n"); }
       my $movietitle = parseBetween($response, "<title>", "</title>");
       # drop the trailing "(year...)"; keep the full title if there is none
       if ($movietitle =~ m#(.+) \((\d+).*\)#) {
          $movietitle = $1;
       }
       print "$movienum:$movietitle\n";
       exit(0);
   }

   # extract possible matches
   #    possible matches are grouped in several catagories:
   #        exact, partial, and approximate
   my $popular_results = parseBetween($response, "<b>Popular Titles</b>",
                                              "</p>");
   my $exact_matches = parseBetween($response, "<b>Titles (Exact Matches)</b>",
                                              "</p>");
   my $partial_matches = parseBetween($response, "<b>Titles (Partial Matches)</b>",
                                              "</p>");
#   my $approx_matches = parseBetween($response, "<b>Approximate Matches</b>",
#                                               "</ol>");

#   my $data = $exact_matches.$partial_matches;
   my $data = $popular_results.$exact_matches;
   # resort to partial matches if no exact
   if ($data eq "") { $data = $partial_matches; }
   # resort to approximate matches if no exact or partial
#   if ($data eq "") { $data = $approx_matches; }
   if ($data eq "") {
      if (defined $opt_d) { printf("# no results\n"); }
      return;
   }

   # parse movie list from matches: every "<td ... </td" span is one entry.
   # The /g match always advances, so an unterminated last cell simply ends
   # the scan.
   my @movies;
   my $link_end = "</a>";
   while ($data =~ m{<td(.*?)</td}gs) {
      my $entry = $1;

      my $title = "";
      my $year = "";
      my $type = "";
      my $movienum = "";

      # split the entry after its first link: the link carries id and
      # title, the remainder carries "(year) (type)"
      my $fl_end = index($entry, $link_end);
      $fl_end += length($link_end);
      my $lhs = substr($entry, 0, $fl_end);
      my $rhs = substr($entry, $fl_end);

      if ($lhs =~ m/<a href="\/title\/tt(\d+)\/.*\">(.+)<\/a>/i) {
          $movienum = $1;
          $title = $2;
      } else {
           if (defined $opt_d) {
               print("Unrecognized entry format\n");
           }
           next;
      }

      if ($rhs =~ m/\((\d+)\) \((.+)\)/) {
          $year = $1;
          $type = $2;
      } elsif ($rhs =~ m/\((\d+)\)/) {
          $year = $1;
      }

      my $skip = 0;

      # fix broken 'tv=no' option
      if ($options =~ /tv=no/) {
         if ($type eq "TV") {
            if (defined $opt_d) {printf("# skipping TV program: %s\n", $title);}
            $skip = 1;
         }
      }
      if ($options =~ /tv=only/) {
         if ($type eq "") {
            if (defined $opt_d) {printf("# skipping Movie: %s\n", $title);}
            $skip = 1;
         }
      }
      # fix broken 'from_year=' option
      # (an entry without a year compares as 0, as it always did, but
      #  without the numeric warning for "")
      if ($options =~ /from_year=(\d+)/) {
         if (($year || 0) < $1) {
            if (defined $opt_d) {printf("# skipping b/c of yr: %s\n", $title);}
            $skip = 1;
         }
      }
      # fix broken 'to_year=' option
      if ($options =~ /to_year=(\d+)/) {
         if (($year || 0) > $1) {
            if (defined $opt_d) {printf("# skipping b/c of yr: %s\n", $title);}
            $skip = 1;
         }
      }

      # option to strip out videos (I think that's what '(V)' means anyway?)
      if ($options =~ /video=no/) {
         if ($type eq "V") {
            if (defined $opt_d) {
                printf("# skipping Video program: %s\n", $title);
            }
            $skip = 1;
         }
      }

      # (always) strip out video game's (why does IMDB give these anyway?)
      if ($type eq "VG") {
         if (defined $opt_d) {printf("# skipping videogame: %s\n", $title);}
         $skip = 1;
      }

      # add to array
      if (!$skip) {
          my $moviename = $title;
          if ($year ne "") {
              $moviename .= " ($year)";
          }

         push @movies, $movienum . ":" . $moviename;
      }
   }

   # display array of values
   for my $movie (@movies) { print "$movie\n"; }
}

#
# Main Program
#

# parse command line arguments
getopts('ohrdivDMP');

# print out info
if (defined $opt_v) { version(); exit 1; }
if (defined $opt_i) { info(); exit 1; }

# print out usage if needed (help() does not return)
if (defined $opt_h || $#ARGV<0) { help(); }

if (defined $opt_D) {
   # take movieid from cmdline arg
   my $movieid = shift || die "Usage : $0 -D <movieid>\n";
   getMovieData($movieid);
}

elsif (defined $opt_P) {
   # take movieid from cmdline arg
   my $movieid = shift || die "Usage : $0 -P <movieid>\n";
   getMoviePoster($movieid);
}

elsif (defined $opt_M) {
   # take query from cmdline arg
   my $options = shift || die "Usage : $0 -M [options] <query>\n";
   my $query = shift;
   if (!$query) {
      # only one argument was given: it is the query, not the options
      $query = $options;
      $options = "";
   }
   getMovieList($query, $options);
}
# vim: set expandtab ts=3 sw=3 :