vcs.maemo.org Git - pkg-perl/blob - deb-src/libwww-mechanize-perl/libwww-mechanize-perl-1.34/lib/WWW/Mechanize.pm

   1 package WWW::Mechanize;
   2
   3 =head1 NAME
   4
   5 WWW::Mechanize - Handy web browsing in a Perl object
   6
   7 =head1 VERSION
   8
   9 Version 1.34
  10
  11 =cut
  12
  13 our $VERSION = '1.34';
  14
  15 =head1 SYNOPSIS
  16
  17 C<WWW::Mechanize>, or Mech for short, helps you automate interaction with
  18 a website. It supports performing a sequence of page fetches including
  19 following links and submitting forms. Each fetched page is parsed and
  20 its links and forms are extracted. A link or a form can be selected, form
  21 fields can be filled and the next page can be fetched. Mech also stores
  22 a history of the URLs you've visited, which can be queried and revisited.
  23
  24     use WWW::Mechanize;
  25     my $mech = WWW::Mechanize->new();
  26
  27     $mech->get( $url );
  28
  29     $mech->follow_link( n => 3 );
  30     $mech->follow_link( text_regex => qr/download this/i );
  31     $mech->follow_link( url => 'http://host.com/index.html' );
  32
  33     $mech->submit_form(
  34         form_number => 3,
  35         fields      => {
  36             username    => 'mungo',
  37             password    => 'lost-and-alone',
  38         }
  39     );
  40
  41     $mech->submit_form(
  42         form_name => 'search',
  43         fields    => { query  => 'pot of gold', },
  44         button    => 'Search Now'
  45     );
  46
  47
  48 Mech is well suited for use in testing web applications.  If you use
  49 one of the Test::*, like L<Test::HTML::Lint> modules, you can check the
  50 fetched content and use that as input to a test call.
  51
  52     use Test::More;
  53     like( $mech->content(), qr/$expected/, "Got expected content" );
  54
  55 Each page fetch stores its URL in a history stack which you can
  56 traverse.
  57
  58     $mech->back();
  59
  60 If you want finer control over your page fetching, you can use
  61 these methods. C<follow_link> and C<submit_form> are just high
  62 level wrappers around them.
  63
  64     $mech->find_link( n => $number );
  65     $mech->form_number( $number );
  66     $mech->form_name( $name );
  67     $mech->field( $name, $value );
  68     $mech->set_fields( %field_values );
  69     $mech->set_visible( @criteria );
  70     $mech->click( $button );
  71
  72 L<WWW::Mechanize> is a proper subclass of L<LWP::UserAgent> and
  73 you can also use any of L<LWP::UserAgent>'s methods.
  74
  75     $mech->add_header($name => $value);
  76
  77 Please note that Mech does NOT support JavaScript.  Please check the
  78 FAQ in WWW::Mechanize::FAQ for more.
  79
  80 =head1 IMPORTANT LINKS
  81
  82 =over 4
  83
  84 =item * L<http://code.google.com/p/www-mechanize/issues/list>
  85
  86 The queue for bugs & enhancements in WWW::Mechanize and
  87 Test::WWW::Mechanize.  Please note that the queue at L<http://rt.cpan.org>
  88 is no longer maintained.
  89
  90 =item * L<http://search.cpan.org/dist/WWW-Mechanize/>
  91
  92 The CPAN documentation page for Mechanize.
  93
  94 =item * L<http://search.cpan.org/dist/WWW-Mechanize/lib/WWW/Mechanize/FAQ.pod>
  95
  96 Frequently asked questions.  Make sure you read here FIRST.
  97
  98 =back
  99
 100 =cut
 101
 102 use strict;
 103 use warnings;
 104
 105 use HTTP::Request 1.30;
 106 use LWP::UserAgent 2.003;
 107 use HTML::Form 1.00;
 108 use HTML::TokeParser;
 109
 110 use base 'LWP::UserAgent';
 111
 112 our $HAS_ZLIB;
 113 BEGIN {
 114     $HAS_ZLIB = eval 'use Compress::Zlib (); 1;';
 115 }
 116
 117 =head1 CONSTRUCTOR AND STARTUP
 118
 119 =head2 new()
 120
 121 Creates and returns a new WWW::Mechanize object, hereafter referred to as
 122 the "agent".
 123
 124     my $mech = WWW::Mechanize->new()
 125
 126 The constructor for WWW::Mechanize overrides two of the parms to the
 127 LWP::UserAgent constructor:
 128
 129     agent => 'WWW-Mechanize/#.##'
 130     cookie_jar => {}    # an empty, memory-only HTTP::Cookies object
 131
 132 You can override these overrides by passing parms to the constructor,
 133 as in:
 134
 135     my $mech = WWW::Mechanize->new( agent => 'wonderbot 1.01' );
 136
 137 If you want none of the overhead of a cookie jar, or don't want your
 138 bot accepting cookies, you have to explicitly disallow it, like so:
 139
 140     my $mech = WWW::Mechanize->new( cookie_jar => undef );
 141
 142 Here are the parms that WWW::Mechanize recognizes.  These do not include
 143 parms that L<LWP::UserAgent> recognizes.
 144
 145 =over 4
 146
 147 =item * C<< autocheck => [0|1] >>
 148
 149 Checks each request made to see if it was successful.  This saves you
 150 the trouble of manually checking yourself.  Any errors found are errors,
 151 not warnings.  Default is off.
 152
 153 =item * C<< onwarn => \&func >>
 154
 155 Reference to a C<warn>-compatible function, such as C<< L<Carp>::carp >>,
 156 that is called when a warning needs to be shown.
 157
 158 If this is set to C<undef>, no warnings will ever be shown.  However,
 159 it's probably better to use the C<quiet> method to control that behavior.
 160
 161 If this value is not passed, Mech uses C<Carp::carp> if L<Carp> is
 162 installed, or C<CORE::warn> if not.
 163
 164 =item * C<< onerror => \&func >>
 165
 166 Reference to a C<die>-compatible function, such as C<< L<Carp>::croak >>,
 167 that is called when there's a fatal error.
 168
 169 If this is set to C<undef>, no errors will ever be shown.
 170
 171 If this value is not passed, Mech uses C<Carp::croak> if L<Carp> is
 172 installed, or C<CORE::die> if not.
 173
 174 =item * C<< quiet => [0|1] >>
 175
 176 Don't complain on warnings.  Setting C<< quiet => 1 >> is the same as
 177 calling C<< $mech->quiet(1) >>.  Default is off.
 178
 179 =item * C<< stack_depth => $value >>
 180
 181 Sets the depth of the page stack that keeps track of all the downloaded
 182 pages. Default is 0 (infinite). If the stack is eating up your memory,
 183 then set it to 1.
 184
 185 =back
 186
 187 =cut
 188
 189 sub new {
 190     my $class = shift;
 191
 192     my %parent_parms = (
 193         agent       => "WWW-Mechanize/$VERSION",
 194         cookie_jar  => {},
 195     );
 196
 197     my %mech_parms = (
 198         autocheck   => 0,
 199         onwarn      => \&WWW::Mechanize::_warn,
 200         onerror     => \&WWW::Mechanize::_die,
 201         quiet       => 0,
 202         stack_depth => 8675309,     # Arbitrarily humongous stack
 203         headers     => {},
 204     );
 205
 206     my %passed_parms = @_;
 207
 208     # Keep the mech-specific parms before creating the object.
 209     while ( my($key,$value) = each %passed_parms ) {
 210         if ( exists $mech_parms{$key} ) {
 211             $mech_parms{$key} = $value;
 212         }
 213         else {
 214             $parent_parms{$key} = $value;
 215         }
 216     }
 217
 218     my $self = $class->SUPER::new( %parent_parms );
 219     bless $self, $class;
 220
 221     # Use the mech parms now that we have a mech object.
 222     for my $parm ( keys %mech_parms ) {
 223         $self->{$parm} = $mech_parms{$parm};
 224     }
 225     $self->{page_stack} = [];
 226     $self->env_proxy();
 227
 228     # libwww-perl 5.800 (and before, I assume) has a problem where
 229     # $ua->{proxy} can be undef and clone() doesn't handle it.
 230     $self->{proxy} = {} unless defined $self->{proxy};
 231     push( @{$self->requests_redirectable}, 'POST' );
 232
 233     $self->_reset_page;
 234
 235     return $self;
 236 }
 237
 238 =head2 $mech->agent_alias( $alias )
 239
 240 Sets the user agent string to the expanded version from a table of actual user strings.
 241 I<$alias> can be one of the following:
 242
 243 =over 4
 244
 245 =item * Windows IE 6
 246
 247 =item * Windows Mozilla
 248
 249 =item * Mac Safari
 250
 251 =item * Mac Mozilla
 252
 253 =item * Linux Mozilla
 254
 255 =item * Linux Konqueror
 256
 257 =back
 258
 259 then it will be replaced with a more interesting one.  For instance,
 260
 261     $mech->agent_alias( 'Windows IE 6' );
 262
 263 sets your User-Agent to
 264
 265     Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
 266
 267 The list of valid aliases can be returned from C<known_agent_aliases()>.  The current list is:
 268
 269 =over
 270
 271 =item * Windows IE 6
 272
 273 =item * Windows Mozilla
 274
 275 =item * Mac Safari
 276
 277 =item * Mac Mozilla
 278
 279 =item * Linux Mozilla
 280
 281 =item * Linux Konqueror
 282
 283 =back
 284
 285 =cut
 286
 287 my %known_agents = (
 288     'Windows IE 6'      => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
 289     'Windows Mozilla'   => 'Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US; rv:1.4b) Gecko/20030516 Mozilla Firebird/0.6',
 290     'Mac Safari'        => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X; en-us) AppleWebKit/85 (KHTML, like Gecko) Safari/85',
 291     'Mac Mozilla'       => 'Mozilla/5.0 (Macintosh; U; PPC Mac OS X Mach-O; en-US; rv:1.4a) Gecko/20030401',
 292     'Linux Mozilla'     => 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.4) Gecko/20030624',
 293     'Linux Konqueror'   => 'Mozilla/5.0 (compatible; Konqueror/3; Linux)',
 294 );
 295
 296 sub agent_alias {
 297     my $self = shift;
 298     my $alias = shift;
 299
 300     if ( defined $known_agents{$alias} ) {
 301         return $self->agent( $known_agents{$alias} );
 302     }
 303     else {
 304         $self->warn( qq{Unknown agent alias "$alias"} );
 305         return $self->agent();
 306     }
 307 }
 308
 309 =head2 known_agent_aliases()
 310
 311 Returns a list of all the agent aliases that Mech knows about.
 312
 313 =cut
 314
 315 sub known_agent_aliases {
 316     return sort keys %known_agents;
 317 }
 318
 319 =head1 PAGE-FETCHING METHODS
 320
 321 =head2 $mech->get( $uri )
 322
 323 Given a URL/URI, fetches it.  Returns an L<HTTP::Response> object.
 324 I<$uri> can be a well-formed URL string, a L<URI> object, or a
 325 L<WWW::Mechanize::Link> object.
 326
 327 The results are stored internally in the agent object, but you don't
 328 know that.  Just use the accessors listed below.  Poking at the
 329 internals is deprecated and subject to change in the future.
 330
 331 C<get()> is a well-behaved overloaded version of the method in
 332 L<LWP::UserAgent>.  This lets you do things like
 333
 334     $mech->get( $uri, ':content_file' => $tempfile );
 335
 336 and you can rest assured that the parms will get filtered down
 337 appropriately.
 338
 339 B<NOTE:> Because C<:content_file> causes the page contents to be
 340 stored in a file instead of the response object, some Mech functions
 341 that expect it to be there won't work as expected. Use with caution.
 342
 343 =cut
 344
 345 sub get {
 346     my $self = shift;
 347     my $uri = shift;
 348
 349     $uri = $uri->url if ref($uri) eq 'WWW::Mechanize::Link';
 350
 351     $uri = $self->base
 352             ? URI->new_abs( $uri, $self->base )
 353             : URI->new( $uri );
 354
 355     # It appears we are returning a super-class method,
 356     # but it in turn calls the request() method here in Mechanize
 357     return $self->SUPER::get( $uri->as_string, @_ );
 358 }
 359
 360 =head2 $mech->put( $uri, content => $content )
 361
 362 PUTs I<$content> to $uri.  Returns an L<HTTP::Response> object.
 363 I<$uri> can be a well-formed URI string, a L<URI> object, or a
 364 L<WWW::Mechanize::Link> object.
 365
 366 =cut
 367
 368 sub put {
 369     my $self = shift;
 370     my $uri = shift;
 371
 372     $uri = $uri->url if ref($uri) eq 'WWW::Mechanize::Link';
 373
 374     $uri = $self->base
 375             ? URI->new_abs( $uri, $self->base )
 376             : URI->new( $uri );
 377
 378     # It appears we are returning a super-class method,
 379     # but it in turn calls the request() method here in Mechanize
 380     return $self->_SUPER_put( $uri->as_string, @_ );
 381 }
 382
 383
 384 # Added until LWP::UserAgent has it.
 385 sub _SUPER_put {
 386     require HTTP::Request::Common;
 387     my($self, @parameters) = @_;
 388     my @suff = $self->_process_colonic_headers(\@parameters,1);
 389     return $self->request( HTTP::Request::Common::PUT( @parameters ), @suff );
 390 }
 391
 392 =head2 $mech->reload()
 393
 394 Acts like the reload button in a browser: repeats the current
 395 request. The history (as per the L<back> method) is not altered.
 396
 397 Returns the L<HTTP::Response> object from the reload, or C<undef>
 398 if there's no current request.
 399
 400 =cut
 401
 402 sub reload {
 403     my $self = shift;
 404
 405     return unless my $req = $self->{req};
 406
 407     return $self->_update_page( $req, $self->_make_request( $req, @_ ) );
 408 }
 409
 410 =head2 $mech->back()
 411
 412 The equivalent of hitting the "back" button in a browser.  Returns to
 413 the previous page.  Won't go back past the first page. (Really, what
 414 would it do if it could?)
 415
 416 =cut
 417
 418 sub back {
 419     my $self = shift;
 420     $self->_pop_page_stack;
 421 }
 422
 423 =head1 STATUS METHODS
 424
 425 =head2 $mech->success()
 426
 427 Returns a boolean telling whether the last request was successful.
 428 If there hasn't been an operation yet, returns false.
 429
 430 This is a convenience function that wraps C<< $mech->res->is_success >>.
 431
 432 =cut
 433
 434 sub success {
 435     my $self = shift;
 436
 437     return $self->res && $self->res->is_success;
 438 }
 439
 440
 441 =head2 $mech->uri()
 442
 443 Returns the current URI as a L<URI> object. This object stringifies
 444 to the URI itself.
 445
 446 =head2 $mech->response() / $mech->res()
 447
 448 Return the current response as an L<HTTP::Response> object.
 449
 450 Synonym for C<< $mech->response() >>
 451
 452 =head2 $mech->status()
 453
 454 Returns the HTTP status code of the response.
 455
 456 =head2 $mech->ct()
 457
 458 Returns the content type of the response.
 459
 460 =head2 $mech->base()
 461
 462 Returns the base URI for the current response
 463
 464 =head2 $mech->forms()
 465
 466 When called in a list context, returns a list of the forms found in
 467 the last fetched page. In a scalar context, returns a reference to
 468 an array with those forms. The forms returned are all L<HTML::Form>
 469 objects.
 470
 471 =head2 $mech->current_form()
 472
 473 Returns the current form as an L<HTML::Form> object.
 474
 475 =head2 $mech->links()
 476
 477 When called in a list context, returns a list of the links found in the
 478 last fetched page.  In a scalar context it returns a reference to an array
 479 with those links.  Each link is a L<WWW::Mechanize::Link> object.
 480
 481 =head2 $mech->is_html()
 482
 483 Returns true/false on whether our content is HTML, according to the
 484 HTTP headers.
 485
 486 =cut
 487
 488 sub uri {
 489     my $self = shift;
 490     return $self->response->request->uri;
 491 }
 492
 493 sub res {           my $self = shift; return $self->{res}; }
 494 sub response {      my $self = shift; return $self->{res}; }
 495 sub status {        my $self = shift; return $self->{status}; }
 496 sub ct {            my $self = shift; return $self->{ct}; }
 497 sub base {          my $self = shift; return $self->{base}; }
 498 sub current_form {  my $self = shift; return $self->{form}; }
 499 sub is_html {       my $self = shift; return defined $self->{ct} && ($self->{ct} eq 'text/html'); }
 500
 501 =head2 $mech->title()
 502
 503 Returns the contents of the C<< <TITLE> >> tag, as parsed by
 504 L<HTML::HeadParser>.  Returns undef if the content is not HTML.
 505
 506 =cut
 507
 508 sub title {
 509     my $self = shift;
 510     return unless $self->is_html;
 511
 512     require HTML::HeadParser;
 513     my $p = HTML::HeadParser->new;
 514     $p->parse($self->content);
 515     return $p->header('Title');
 516 }
 517
 518 =head1 CONTENT-HANDLING METHODS
 519
 520 =head2 $mech->content(...)
 521
 522 Returns the content that the mech uses internally for the last page
 523 fetched. Ordinarily this is the same as $mech->response()->content(),
 524 but this may differ for HTML documents if L</update_html> is
 525 overloaded (in which case the value passed to the base-class
 526 implementation of same will be returned), and/or extra named arguments
 527 are passed to I<content()>:
 528
 529 =over 2
 530
 531 =item I<< $mech->content( format => 'text' ) >>
 532
 533 Returns a text-only version of the page, with all HTML markup
 534 stripped. This feature requires I<HTML::TreeBuilder> to be installed,
 535 or a fatal error will be thrown.
 536
 537 =item I<< $mech->content( base_href => [$base_href|undef] ) >>
 538
 539 Returns the HTML document, modified to contain a
 540 C<< <base href="$base_href"> >> mark-up in the header.
 541 I<$base_href> is C<< $mech->base() >> if not specified. This is
 542 handy to pass the HTML to e.g. L<HTML::Display>.
 543
 544 =back
 545
 546 Passing arguments to C<content()> if the current document is not
 547 HTML has no effect now (i.e. the return value is the same as
 548 C<< $self->response()->content() >>. This may change in the future,
 549 but will likely be backwards-compatible when it does.
 550
 551 =cut
 552
 553 sub content {
 554     my $self = shift;
 555     my $content = $self->{content};
 556
 557     if ( $self->is_html ) {
 558         my %parms = @_;
 559         if ( exists $parms{base_href} ) {
 560             my $arg = (delete $parms{base_href}) || $self->base;
 561             $content=~s/<head>/<head>\n<base href="$arg">/i;
 562         }
 563         if ( my $arg = delete $parms{format} ) {
 564             if ($arg eq 'text') {
 565                 require HTML::TreeBuilder;
 566                 my $tree = HTML::TreeBuilder->new();
 567                 $tree->parse($content);
 568                 $tree->eof();
 569                 $tree->elementify(); # just for safety
 570                 $content = $tree->as_text();
 571                 $tree->delete;
 572             }
 573             else {
 574                 $self->die( qq{Unknown "format" parameter "$arg"} );
 575             }
 576         }
 577         for my $cmd ( sort keys %parms ) {
 578             $self->die( qq{Unknown named argument "$cmd"} );
 579         }
 580     } # is HTML
 581
 582     return $content;
 583 }
 584
 585 =head1 LINK METHODS
 586
 587 =head2 $mech->links
 588
 589 Lists all the links on the current page.  Each link is a
 590 WWW::Mechanize::Link object. In list context, returns a list of all
 591 links.  In scalar context, returns an array reference of all links.
 592
 593 =cut
 594
 595 sub links {
 596     my $self = shift ;
 597
 598     $self->_extract_links() unless $self->{_extracted_links};
 599
 600     return @{$self->{links}} if wantarray;
 601     return $self->{links};
 602 }
 603
 604 =head2 $mech->follow_link(...)
 605
 606 Follows a specified link on the page.  You specify the match to be
 607 found using the same parms that C<L<find_link()>> uses.
 608
 609 Here some examples:
 610
 611 =over 4
 612
 613 =item * 3rd link called "download"
 614
 615     $mech->follow_link( text => 'download', n => 3 );
 616
 617 =item * first link where the URL has "download" in it, regardless of case:
 618
 619     $mech->follow_link( url_regex => qr/download/i );
 620
 621 or
 622
 623     $mech->follow_link( url_regex => qr/(?i:download)/ );
 624
 625 =item * 3rd link on the page
 626
 627     $mech->follow_link( n => 3 );
 628
 629 =back
 630
 631 Returns the result of the GET method (an HTTP::Response object) if
 632 a link was found. If the page has no links, or the specified link
 633 couldn't be found, returns undef.
 634
 635 =cut
 636
 637 sub follow_link {
 638     my $self = shift;
 639     my %parms = ( n=>1, @_ );
 640
 641     if ( $parms{n} eq 'all' ) {
 642         delete $parms{n};
 643         $self->warn( q{follow_link(n=>"all") is not valid} );
 644     }
 645
 646     my $link = $self->find_link(%parms);
 647     return $self->get( $link->url ) if $link;
 648     return;
 649 }
 650
 651 =head2 $mech->find_link( ... )
 652
 653 Finds a link in the currently fetched page. It returns a
 654 L<WWW::Mechanize::Link> object which describes the link.  (You'll
 655 probably be most interested in the C<url()> property.)  If it fails
 656 to find a link it returns undef.
 657
 658 You can take the URL part and pass it to the C<get()> method.  If
 659 that's your plan, you might as well use the C<follow_link()> method
 660 directly, since it does the C<get()> for you automatically.
 661
 662 Note that C<< <FRAME SRC="..."> >> tags are parsed out of the the HTML
 663 and treated as links so this method works with them.
 664
 665 You can select which link to find by passing in one or more of these
 666 key/value pairs:
 667
 668 =over 4
 669
 670 =item * C<< text => 'string', >> and C<< text_regex => qr/regex/, >>
 671
 672 C<text> matches the text of the link against I<string>, which must be an
 673 exact match.  To select a link with text that is exactly "download", use
 674
 675     $mech->find_link( text => 'download' );
 676
 677 C<text_regex> matches the text of the link against I<regex>.  To select a
 678 link with text that has "download" anywhere in it, regardless of case, use
 679
 680     $mech->find_link( text_regex => qr/download/i );
 681
 682 Note that the text extracted from the page's links are trimmed.  For
 683 example, C<< <a> foo </a> >> is stored as 'foo', and searching for
 684 leading or trailing spaces will fail.
 685
 686 =item * C<< url => 'string', >> and C<< url_regex => qr/regex/, >>
 687
 688 Matches the URL of the link against I<string> or I<regex>, as appropriate.
 689 The URL may be a relative URL, like F<foo/bar.html>, depending on how
 690 it's coded on the page.
 691
 692 =item * C<< url_abs => string >> and C<< url_abs_regex => regex >>
 693
 694 Matches the absolute URL of the link against I<string> or I<regex>,
 695 as appropriate.  The URL will be an absolute URL, even if it's relative
 696 in the page.
 697
 698 =item * C<< name => string >> and C<< name_regex => regex >>
 699
 700 Matches the name of the link against I<string> or I<regex>, as appropriate.
 701
 702 =item * C<< id => string >> and C<< id_regex => regex >>
 703
 704 Matches the attribute 'id' of the link against I<string> or
 705 I<regex>, as appropriate.
 706
 707 =item * C<< class => string >> and C<< class_regex => regex >>
 708
 709 Matches the attribute 'class' of the link against I<string> or
 710 I<regex>, as appropriate.
 711
 712 =item * C<< tag => string >> and C<< tag_regex => regex >>
 713
 714 Matches the tag that the link came from against I<string> or I<regex>,
 715 as appropriate.  The C<tag_regex> is probably most useful to check for
 716 more than one tag, as in:
 717
 718     $mech->find_link( tag_regex => qr/^(a|frame)$/ );
 719
 720 The tags and attributes looked at are defined below, at
 721 L<< $mech->find_link() : link format >>.
 722
 723 =back
 724
 725 If C<n> is not specified, it defaults to 1.  Therefore, if you don't
 726 specify any parms, this method defaults to finding the first link on the
 727 page.
 728
 729 Note that you can specify multiple text or URL parameters, which
 730 will be ANDed together.  For example, to find the first link with
 731 text of "News" and with "cnn.com" in the URL, use:
 732
 733     $mech->find_link( text => 'News', url_regex => qr/cnn\.com/ );
 734
 735 The return value is a reference to an array containing a
 736 L<WWW::Mechanize::Link> object for every link in C<< $self->content >>.
 737
 738 The links come from the following:
 739
 740 =over 4
 741
 742 =item C<< <A HREF=...> >>
 743
 744 =item C<< <AREA HREF=...> >>
 745
 746 =item C<< <FRAME SRC=...> >>
 747
 748 =item C<< <IFRAME SRC=...> >>
 749
 750 =item C<< <META CONTENT=...> >>
 751
 752 =back
 753
 754 =cut
 755
 756 sub find_link {
 757     my $self = shift;
 758     my %parms = ( n=>1, @_ );
 759
 760     my $wantall = ( $parms{n} eq 'all' );
 761
 762     $self->_clean_keys( \%parms, qr/^(n|(text|url|url_abs|name|tag|id|class)(_regex)?)$/ );
 763
 764     my @links = $self->links or return;
 765
 766     my $nmatches = 0;
 767     my @matches;
 768     for my $link ( @links ) {
 769         if ( _match_any_link_parms($link,\%parms) ) {
 770             if ( $wantall ) {
 771                 push( @matches, $link );
 772             }
 773             else {
 774                 ++$nmatches;
 775                 return $link if $nmatches >= $parms{n};
 776             }
 777         }
 778     } # for @links
 779
 780     if ( $wantall ) {
 781         return @matches if wantarray;
 782         return \@matches;
 783     }
 784
 785     return;
 786 } # find_link
 787
 788 # Used by find_links to check for matches
 789 # The logic is such that ALL parm criteria that are given must match
 790 sub _match_any_link_parms {
 791     my $link = shift;
 792     my $p = shift;
 793
 794     # No conditions, anything matches
 795     return 1 unless keys %$p;
 796
 797     return if defined $p->{url}           && !($link->url eq $p->{url} );
 798     return if defined $p->{url_regex}     && !($link->url =~ $p->{url_regex} );
 799     return if defined $p->{url_abs}       && !($link->url_abs eq $p->{url_abs} );
 800     return if defined $p->{url_abs_regex} && !($link->url_abs =~ $p->{url_abs_regex} );
 801     return if defined $p->{text}          && !(defined($link->text) && $link->text eq $p->{text} );
 802     return if defined $p->{text_regex}    && !(defined($link->text) && $link->text =~ $p->{text_regex} );
 803     return if defined $p->{name}          && !(defined($link->name) && $link->name eq $p->{name} );
 804     return if defined $p->{name_regex}    && !(defined($link->name) && $link->name =~ $p->{name_regex} );
 805     return if defined $p->{tag}           && !($link->tag && $link->tag eq $p->{tag} );
 806     return if defined $p->{tag_regex}     && !($link->tag && $link->tag =~ $p->{tag_regex} );
 807
 808     return if defined $p->{id}            && !($link->attrs->{id} && $link->attrs->{id} eq $p->{id} );
 809     return if defined $p->{id_regex}      && !($link->attrs->{id} && $link->attrs->{id} =~ $p->{id_regex} );
 810     return if defined $p->{class}         && !($link->attrs->{class} && $link->attrs->{class} eq $p->{class} );
 811     return if defined $p->{class_regex}   && !($link->attrs->{class} && $link->attrs->{class} =~ $p->{class_regex} );
 812
 813     # Success: everything that was defined passed.
 814     return 1;
 815
 816 }
 817
 818 # Cleans the %parms parameter for the find_link and find_image methods.
 819 sub _clean_keys {
 820     my $self = shift;
 821     my $parms = shift;
 822     my $rx_keyname = shift;
 823
 824     for my $key ( keys %$parms ) {
 825         my $val = $parms->{$key};
 826         if ( $key !~ qr/$rx_keyname/ ) {
 827             $self->warn( qq{Unknown link-finding parameter "$key"} );
 828             delete $parms->{$key};
 829             next;
 830         }
 831
 832         my $key_regex = ( $key =~ /_regex$/ );
 833         my $val_regex = ( ref($val) eq 'Regexp' );
 834
 835         if ( $key_regex ) {
 836             if ( !$val_regex ) {
 837                 $self->warn( qq{$val passed as $key is not a regex} );
 838                 delete $parms->{$key};
 839                 next;
 840             }
 841         }
 842         else {
 843             if ( $val_regex ) {
 844                 $self->warn( qq{$val passed as '$key' is a regex} );
 845                 delete $parms->{$key};
 846                 next;
 847             }
 848             if ( $val =~ /^\s|\s$/ ) {
 849                 $self->warn( qq{'$val' is space-padded and cannot succeed} );
 850                 delete $parms->{$key};
 851                 next;
 852             }
 853         }
 854     } # for keys %parms
 855 } # _clean_keys()
 856
 857
 858 =head2 $mech->find_all_links( ... )
 859
 860 Returns all the links on the current page that match the criteria.  The
 861 method for specifying link criteria is the same as in C<L<find_link()>>.
 862 Each of the links returned is a L<WWW::Mechanize::Link> object.
 863
 864 In list context, C<find_all_links()> returns a list of the links.
 865 Otherwise, it returns a reference to the list of links.
 866
 867 C<find_all_links()> with no parameters returns all links in the
 868 page.
 869
 870 =cut
 871
 872 sub find_all_links {
 873     my $self = shift;
 874     return $self->find_link( @_, n=>'all' );
 875 }
 876
 877 =head2 $mech->find_all_inputs( ... criteria ... )
 878
 879 find_all_inputs() returns an array of all the input controls in the
 880 current form whose properties match all of the regexes passed in.
 881 The controls returned are all descended from HTML::Form::Input.
 882
 883 If no criteria are passed, all inputs will be returned.
 884
 885 If there is no current page, there is no form on the current
 886 page, or there are no submit controls in the current form
 887 then the return will be an empty array.
 888
 889 You may use a regex or a literal string:
 890
 891     # get all textarea controls whose names begin with "customer"
 892     my @customer_text_inputs =
 893         $mech->find_all_inputs( {
 894             type       => 'textarea',
 895             name_regex => qr/^customer/,
 896         }
 897     );
 898
 899     # get all text or textarea controls called "customer"
 900     my @customer_text_inputs =
 901         $mech->find_all_inputs( {
 902             type_regex => qr/^(text|textarea)$/,
 903             name       => 'customer',
 904         }
 905     );
 906
 907 =cut
 908
 909 sub find_all_inputs {
 910     my $self = shift;
 911     my %criteria = @_;
 912
 913     my $form = $self->current_form() or return;
 914
 915     my @found;
 916     foreach my $input ( $form->inputs ) { # check every pattern for a match on the current hash
 917         my $matched = 1;
 918         foreach my $criterion ( sort keys %criteria ) { # Sort so we're deterministic
 919             my $field = $criterion;
 920             my $is_regex = ( $field =~ s/(?:_regex)$// );
 921             my $what = $input->{$field};
 922             $matched = defined($what) && (
 923                 $is_regex
 924                     ? ( $what =~ $criteria{$criterion} )
 925                     : ( $what eq $criteria{$criterion} )
 926                 );
 927             last if !$matched;
 928         }
 929         push @found, $input if $matched;
 930     }
 931     return @found;
 932 }
 933
 934 =head2 $mech->find_all_submits( ... criteria ... )
 935
 936 C<find_all_submits()> does the same thing as C<find_all_inputs()>
 937 except that it only returns controls that are submit controls,
 938 ignoring other types of input controls like text and checkboxes.
 939
 940 =cut
 941
 942 sub find_all_submits {
 943     my $self = shift;
 944
 945     return $self->find_all_inputs( @_, type_regex => qr/^(submit|image)$/ );
 946 }
 947
 948
 949 =head1 IMAGE METHODS
 950
 951 =head2 $mech->images
 952
 953 Lists all the images on the current page.  Each image is a
 954 WWW::Mechanize::Image object. In list context, returns a list of all
 955 images.  In scalar context, returns an array reference of all images.
 956
 957 =cut
 958
 959 sub images {
 960     my $self = shift ;
 961
 962     $self->_extract_images() unless $self->{_extracted_images};
 963
 964     return @{$self->{images}} if wantarray;
 965     return $self->{images};
 966 }
 967
 968 =head2 $mech->find_image()
 969
 970 Finds an image in the current page. It returns a
 971 L<WWW::Mechanize::Image> object which describes the image.  If it fails
 972 to find an image it returns undef.
 973
 974 You can select which image to find by passing in one or more of these
 975 key/value pairs:
 976
 977 =over 4
 978
 979 =item * C<< alt => 'string' >> and C<< alt_regex => qr/regex/, >>
 980
 981 C<alt> matches the ALT attribute of the image against I<string>, which must be an
 982 exact match. To select a image with an ALT tag that is exactly "download", use
 983
 984     $mech->find_image( alt => 'download' );
 985
 986 C<alt_regex> matches the ALT attribute of the image  against a regular
 987 expression.  To select an image with an ALT attribute that has "download"
 988 anywhere in it, regardless of case, use
 989
 990     $mech->find_image( alt_regex => qr/download/i );
 991
 992 =item * C<< url => 'string', >> and C<< url_regex => qr/regex/, >>
 993
 994 Matches the URL of the image against I<string> or I<regex>, as appropriate.
 995 The URL may be a relative URL, like F<foo/bar.html>, depending on how
 996 it's coded on the page.
 997
 998 =item * C<< url_abs => string >> and C<< url_abs_regex => regex >>
 999
1000 Matches the absolute URL of the image against I<string> or I<regex>,
1001 as appropriate.  The URL will be an absolute URL, even if it's relative
1002 in the page.
1003
1004 =item * C<< tag => string >> and C<< tag_regex => regex >>
1005
1006 Matches the tag that the image came from against I<string> or I<regex>,
1007 as appropriate.  The C<tag_regex> is probably most useful to check for
1008 more than one tag, as in:
1009
1010     $mech->find_image( tag_regex => qr/^(img|input)$/ );
1011
1012 The tags supported are C<< <img> >> and C<< <input> >>.
1013
1014 =back
1015
1016 If C<n> is not specified, it defaults to 1.  Therefore, if you don't
1017 specify any parms, this method defaults to finding the first image on the
1018 page.
1019
1020 Note that you can specify multiple ALT or URL parameters, which
1021 will be ANDed together.  For example, to find the first image with
1022 ALT text of "News" and with "cnn.com" in the URL, use:
1023
1024     $mech->find_image( image => 'News', url_regex => qr/cnn\.com/ );
1025
1026 The return value is a reference to an array containing a
1027 L<WWW::Mechanize::Image> object for every image in C<< $self->content >>.
1028
1029 =cut
1030
1031 sub find_image {
1032     my $self = shift;
1033     my %parms = ( n=>1, @_ );
1034
1035     my $wantall = ( $parms{n} eq 'all' );
1036
1037     $self->_clean_keys( \%parms, qr/^(n|(alt|url|url_abs|tag)(_regex)?)$/ );
1038
1039     my @images = $self->images or return;
1040
1041     my $nmatches = 0;
1042     my @matches;
1043     for my $image ( @images ) {
1044         if ( _match_any_image_parms($image,\%parms) ) {
1045             if ( $wantall ) {
1046                 push( @matches, $image );
1047             }
1048             else {
1049                 ++$nmatches;
1050                 return $image if $nmatches >= $parms{n};
1051             }
1052         }
1053     } # for @images
1054
1055     if ( $wantall ) {
1056         return @matches if wantarray;
1057         return \@matches;
1058     }
1059
1060     return;
1061 }
1062
1063 # Used by find_images to check for matches
1064 # The logic is such that ALL parm criteria that are given must match
1065 sub _match_any_image_parms {
1066     my $image = shift;
1067     my $p = shift;
1068
1069     # No conditions, anything matches
1070     return 1 unless keys %$p;
1071
1072     return if defined $p->{url}           && !($image->url eq $p->{url} );
1073     return if defined $p->{url_regex}     && !($image->url =~ $p->{url_regex} );
1074     return if defined $p->{url_abs}       && !($image->url_abs eq $p->{url_abs} );
1075     return if defined $p->{url_abs_regex} && !($image->url_abs =~ $p->{url_abs_regex} );
1076     return if defined $p->{alt}           && !(defined($image->alt) && $image->alt eq $p->{alt} );
1077     return if defined $p->{alt_regex}     && !(defined($image->alt) && $image->alt =~ $p->{alt_regex} );
1078     return if defined $p->{tag}           && !($image->tag && $image->tag eq $p->{tag} );
1079     return if defined $p->{tag_regex}     && !($image->tag && $image->tag =~ $p->{tag_regex} );
1080
1081     # Success: everything that was defined passed.
1082     return 1;
1083 }
1084
1085
1086 =head2 $mech->find_all_images( ... )
1087
1088 Returns all the images on the current page that match the criteria.  The
1089 method for specifying image criteria is the same as in C<L<find_image()>>.
1090 Each of the images returned is a L<WWW::Mechanize::Image> object.
1091
1092 In list context, C<find_all_images()> returns a list of the images.
1093 Otherwise, it returns a reference to the list of images.
1094
1095 C<find_all_images()> with no parameters returns all images in the page.
1096
1097 =cut
1098
1099 sub find_all_images {
1100     my $self = shift;
1101     return $self->find_image( @_, n=>'all' );
1102 }
1103
1104 =head1 FORM METHODS
1105
1106 =head2 $mech->forms
1107
1108 Lists all the forms on the current page.  Each form is an L<HTML::Form>
1109 object.  In list context, returns a list of all forms.  In scalar
1110 context, returns an array reference of all forms.
1111
1112 =cut
1113
1114 sub forms {
1115     my $self = shift ;
1116     return @{$self->{forms}} if wantarray;
1117     return $self->{forms};
1118 }
1119
1120
1121 =head2 $mech->form_number($number)
1122
1123 Selects the I<number>th form on the page as the target for subsequent
1124 calls to C<L<field()>> and C<L<click()>>.  Also returns the form that was
1125 selected.
1126
1127 If it is found, the form is returned as an L<HTML::Form> object and set internally
1128 for later use with Mech's form methods such as C<L<field()>> and C<L<click()>>.
1129
1130 Emits a warning and returns undef if no form is found.
1131
1132 The first form is number 1, not zero.
1133
1134 =cut
1135
1136 sub form_number {
1137     my ($self, $form) = @_;
1138     # XXX Should we die if no $form is defined? Same question for form_name()
1139
1140     if ($self->{forms}->[$form-1]) {
1141         $self->{form} = $self->{forms}->[$form-1];
1142         return $self->{form};
1143     }
1144     else {
1145         $self->warn( "There is no form numbered $form" );
1146         return undef;
1147     }
1148 }
1149
1150 =head2 $mech->form_name( $name )
1151
1152 Selects a form by name.  If there is more than one form on the page
1153 with that name, then the first one is used, and a warning is
1154 generated.
1155
1156 If it is found, the form is returned as an L<HTML::Form> object and set internally
1157 for later use with Mech's form methods such as C<L<field()>> and C<L<click()>>.
1158
1159 Returns undef if no form is found.
1160
1161 Note that this functionality requires libwww-perl 5.69 or higher.
1162
1163 =cut
1164
1165 sub form_name {
1166     my ($self, $form) = @_;
1167
1168     my $temp;
1169     my @matches = grep {defined($temp = $_->attr('name')) and ($temp eq $form) } $self->forms;
1170     if ( my $nmatches = @matches ) {
1171         $self->warn( "There are $nmatches forms named $form.  The first one was used." )
1172             if $nmatches > 1;
1173         return $self->{form} = $matches[0];
1174     }
1175     else {
1176         $self->warn( qq{ There is no form named "$form"} );
1177         return undef;
1178     }
1179 }
1180
1181 =head2 $mech->form_with_fields( @fields )
1182
1183 Selects a form by passing in a list of field names it must contain.  If there
1184 is more than one form on the page with that matches, then the first one is used,
1185 and a warning is generated.
1186
1187 If it is found, the form is returned as an L<HTML::Form> object and set internally
1188 for later used with Mech's form methods such as C<L<field()>> and C<L<click()>>.
1189
1190 Returns undef if no form is found.
1191
1192 Note that this functionality requires libwww-perl 5.69 or higher.
1193
1194 =cut
1195
1196 sub form_with_fields {
1197     my ($self, @fields) = @_;
1198     die 'no fields provided' unless scalar @fields;
1199
1200     my @matches;
1201     FORMS: for my $form (@{ $self->forms }) {
1202         my @fields_in_form = $form->param();
1203         for my $field (@fields) {
1204             next FORMS unless grep { $_ eq $field } @fields_in_form;
1205         }
1206         push @matches, $form;
1207     }
1208
1209     if ( my $nmatches = @matches ) {
1210         $self->warn( "There are $nmatches forms with the named fields.  The first one was used." )
1211             if $nmatches > 1;
1212         return $self->{form} = $matches[0];
1213     }
1214     else {
1215         $self->warn( qq{There is no form with the requested fields} );
1216         return undef;
1217     }
1218 }
1219
1220
1221 =head2 $mech->field( $name, $value, $number )
1222
1223 =head2 $mech->field( $name, \@values, $number )
1224
1225 Given the name of a field, set its value to the value specified.  This
1226 applies to the current form (as set by the L<form_name()> or L<form_number()> method or defaulting
1227 to the first form on the page).
1228
1229 The optional I<$number> parameter is used to distinguish between two fields
1230 with the same name.  The fields are numbered from 1.
1231
1232 =cut
1233
1234 sub field {
1235     my ($self, $name, $value, $number) = @_;
1236     $number ||= 1;
1237
1238     my $form = $self->{form};
1239     if ($number > 1) {
1240         $form->find_input($name, undef, $number)->value($value);
1241     }
1242     else {
1243         if ( ref($value) eq 'ARRAY' ) {
1244             $form->param($name, $value);
1245         }
1246         else {
1247             $form->value($name => $value);
1248         }
1249     }
1250 }
1251
1252 =head2 $mech->select($name, $value)
1253
1254 =head2 $mech->select($name, \@values)
1255
1256 Given the name of a C<select> field, set its value to the value
1257 specified.  If the field is not E<lt>select multipleE<gt> and the
1258 C<$value> is an array, only the B<first> value will be set.  [Note:
1259 the documentation previously claimed that only the last value would
1260 be set, but this was incorrect.]  Passing C<$value> as a hash with
1261 an C<n> key selects an item by number (e.g. C<{n => 3> or C<{n => [2,4]}>).
1262 The numbering starts at 1.  This applies to the current form.
1263
1264 Returns 1 on successfully setting the value. On failure, returns
1265 undef and calls C<< $self>warn() >> with an error message.
1266
1267 =cut
1268
1269 sub select {
1270     my ($self, $name, $value) = @_;
1271
1272     my $form = $self->{form};
1273
1274     my $input = $form->find_input($name);
1275     if (!$input) {
1276         $self->warn( qq{Input "$name" not found} );
1277         return;
1278     }
1279
1280     if ($input->type ne 'option') {
1281         $self->warn( qq{Input "$name" is not type "select"} );
1282         return;
1283     }
1284
1285     # For $mech->select($name, {n => 3}) or $mech->select($name, {n => [2,4]}),
1286     # transform the 'n' number(s) into value(s) and put it in $value.
1287     if (ref($value) eq 'HASH') {
1288         for (keys %$value) {
1289             $self->warn(qq{Unknown select value parameter "$_"})
1290               unless $_ eq 'n';
1291         }
1292
1293         if (defined($value->{n})) {
1294             my @inputs = $form->find_input($name, 'option');
1295             my @values = ();
1296             # distinguish between multiple and non-multiple selects
1297             # (see INPUTS section of `perldoc HTML::Form`)
1298             if (@inputs == 1) {
1299                 @values = $inputs[0]->possible_values();
1300             }
1301             else {
1302                 foreach my $input (@inputs) {
1303                     my @possible = $input->possible_values();
1304                     push @values, pop @possible;
1305                 }
1306             }
1307
1308             my $n = $value->{n};
1309             if (ref($n) eq 'ARRAY') {
1310                 $value = [];
1311                 for (@$n) {
1312                     unless (/^\d+$/) {
1313                         $self->warn(qq{"n" value "$_" is not a positive integer});
1314                         return;
1315                     }
1316                     push @$value, $values[$_ - 1];  # might be undef
1317                 }
1318             }
1319             elsif (!ref($n) && $n =~ /^\d+$/) {
1320                 $value = $values[$n - 1];           # might be undef
1321             }
1322             else {
1323                 $self->warn('"n" value is not a positive integer or an array ref');
1324                 return;
1325             }
1326         }
1327         else {
1328             $self->warn('Hash value is invalid');
1329             return;
1330         }
1331     } # hashref
1332
1333     if (ref($value) eq 'ARRAY') {
1334         $form->param($name, $value);
1335         return 1;
1336     }
1337
1338     $form->value($name => $value);
1339     return 1;
1340 }
1341
1342 =head2 $mech->set_fields( $name => $value ... )
1343
1344 This method sets multiple fields of the current form. It takes a list
1345 of field name and value pairs. If there is more than one field with
1346 the same name, the first one found is set. If you want to select which
1347 of the duplicate field to set, use a value which is an anonymous array
1348 which has the field value and its number as the 2 elements.
1349
1350         # set the second foo field
1351         $mech->set_fields( $name => [ 'foo', 2 ] ) ;
1352
1353 The fields are numbered from 1.
1354
1355 This applies to the current form.
1356
1357 =cut
1358
1359 sub set_fields {
1360     my $self = shift;
1361     my %fields = @_;
1362
1363     my $form = $self->current_form or $self->die( 'No form defined' );
1364
1365     while ( my ( $field, $value ) = each %fields ) {
1366         if ( ref $value eq 'ARRAY' ) {
1367             $form->find_input( $field, undef,
1368                          $value->[1])->value($value->[0] );
1369         }
1370         else {
1371             $form->value($field => $value);
1372         }
1373     } # while
1374 } # set_fields()
1375
1376 =head2 $mech->set_visible( @criteria )
1377
1378 This method sets fields of the current form without having to know
1379 their names.  So if you have a login screen that wants a username and
1380 password, you do not have to fetch the form and inspect the source (or
1381 use the F<mech-dump> utility, installed with WWW::Mechanize) to see
1382 what the field names are; you can just say
1383
1384     $mech->set_visible( $username, $password ) ;
1385
1386 and the first and second fields will be set accordingly.  The method
1387 is called set_I<visible> because it acts only on visible fields;
1388 hidden form inputs are not considered.  The order of the fields is
1389 the order in which they appear in the HTML source which is nearly
1390 always the order anyone viewing the page would think they are in,
1391 but some creative work with tables could change that; caveat user.
1392
1393 Each element in C<@criteria> is either a field value or a field
1394 specifier.  A field value is a scalar.  A field specifier allows
1395 you to specify the I<type> of input field you want to set and is
1396 denoted with an arrayref containing two elements.  So you could
1397 specify the first radio button with
1398
1399     $mech->set_visible( [ radio => 'KCRW' ] ) ;
1400
1401 Field values and specifiers can be intermixed, hence
1402
1403     $mech->set_visible( 'fred', 'secret', [ option => 'Checking' ] ) ;
1404
1405 would set the first two fields to "fred" and "secret", and the I<next>
1406 C<OPTION> menu field to "Checking".
1407
1408 The possible field specifier types are: "text", "password", "hidden",
1409 "textarea", "file", "image", "submit", "radio", "checkbox" and "option".
1410
1411 C<set_visible> returns the number of values set.
1412
1413 =cut
1414
1415 sub set_visible {
1416     my $self = shift;
1417
1418     my $form = $self->current_form;
1419     my @inputs = $form->inputs;
1420
1421     my $num_set = 0;
1422     for my $value ( @_ ) {
1423         # Handle type/value pairs an arrayref
1424         if ( ref $value eq 'ARRAY' ) {
1425             my ( $type, $value ) = @$value;
1426             while ( my $input = shift @inputs ) {
1427                 next if $input->type eq 'hidden';
1428                 if ( $input->type eq $type ) {
1429                     $input->value( $value );
1430                     $num_set++;
1431                     last;
1432                 }
1433             } # while
1434         }
1435         # by default, it's a value
1436         else {
1437             while ( my $input = shift @inputs ) {
1438                 next if $input->type eq 'hidden';
1439                 $input->value( $value );
1440                 $num_set++;
1441                 last;
1442             } # while
1443         }
1444     } # for
1445
1446     return $num_set;
1447 } # set_visible()
1448
1449 =head2 $mech->tick( $name, $value [, $set] )
1450
1451 "Ticks" the first checkbox that has both the name and value associated
1452 with it on the current form.  Dies if there is no named check box for
1453 that value.  Passing in a false value as the third optional argument
1454 will cause the checkbox to be unticked.
1455
1456 =cut
1457
1458 sub tick {
1459     my $self = shift;
1460     my $name = shift;
1461     my $value = shift;
1462     my $set = @_ ? shift : 1;  # default to 1 if not passed
1463
1464     # loop though all the inputs
1465     my $index = 0;
1466     while ( my $input = $self->current_form->find_input( $name, 'checkbox', $index ) ) {
1467         # Can't guarantee that the first element will be undef and the second
1468         # element will be the right name
1469         foreach my $val ($input->possible_values()) {
1470             next unless defined $val;
1471             if ($val eq $value) {
1472                 $input->value($set ? $value : undef);
1473                 return;
1474             }
1475         }
1476
1477         # move onto the next input
1478         $index++;
1479     } # while
1480
1481     # got self far?  Didn't find anything
1482     $self->warn( qq{No checkbox "$name" for value "$value" in form} );
1483 } # tick()
1484
1485 =head2 $mech->untick($name, $value)
1486
1487 Causes the checkbox to be unticked.  Shorthand for
1488 C<tick($name,$value,undef)>
1489
1490 =cut
1491
1492 sub untick {
1493     shift->tick(shift,shift,undef);
1494 }
1495
1496 =head2 $mech->value( $name, $number )
1497
1498 Given the name of a field, return its value. This applies to the current
1499 form.
1500
1501 The option I<$number> parameter is used to distinguish between two fields
1502 with the same name.  The fields are numbered from 1.
1503
1504 If the field is of type file (file upload field), the value is always
1505 cleared to prevent remote sites from downloading your local files.
1506 To upload a file, specify its file name explicitly.
1507
1508 =cut
1509
1510 sub value {
1511     my $self = shift;
1512     my $name = shift;
1513     my $number = shift || 1;
1514
1515     my $form = $self->{form};
1516     if ( $number > 1 ) {
1517         return $form->find_input( $name, undef, $number )->value();
1518     }
1519     else {
1520         return $form->value( $name );
1521     }
1522 } # value
1523
1524 =head2 $mech->click( $button [, $x, $y] )
1525
1526 Has the effect of clicking a button on the current form.  The first
1527 argument is the name of the button to be clicked.  The second and
1528 third arguments (optional) allow you to specify the (x,y) coordinates
1529 of the click.
1530
1531 If there is only one button on the form, C<< $mech->click() >> with
1532 no arguments simply clicks that one button.
1533
1534 Returns an L<HTTP::Response> object.
1535
1536 =cut
1537
1538 sub click {
1539     my ($self, $button, $x, $y) = @_;
1540     for ($x, $y) { $_ = 1 unless defined; }
1541     my $request = $self->{form}->click($button, $x, $y);
1542     return $self->request( $request );
1543 }
1544
1545 =head2 $mech->click_button( ... )
1546
1547 Has the effect of clicking a button on the current form by specifying
1548 its name, value, or index.  Its arguments are a list of key/value
1549 pairs.  Only one of name, number, input or value must be specified in
1550 the keys.
1551
1552 =over 4
1553
1554 =item * name => name
1555
1556 Clicks the button named I<name> in the current form.
1557
1558 =item * number => n
1559
1560 Clicks the I<n>th button in the current form. Numbering starts at 1.
1561
1562 =item * value => value
1563
1564 Clicks the button with the value I<value> in the current form.
1565
1566 =item * input => $inputobject
1567
1568 Clicks on the button referenced by $inputobject, an instance of
1569 L<HTML::Form::SubmitInput> obtained e.g. from
1570
1571     $mech->current_form()->find_input( undef, 'submit' )
1572
1573 $inputobject must belong to the current form.
1574
1575 =item * x => x
1576
1577 =item * y => y
1578
1579 These arguments (optional) allow you to specify the (x,y) coordinates
1580 of the click.
1581
1582 =back
1583
1584 =cut
1585
1586 sub click_button {
1587     my $self = shift;
1588     my %args = @_;
1589
1590     for ( keys %args ) {
1591         if ( !/^(number|name|value|input|x|y)$/ ) {
1592             $self->warn( qq{Unknown click_button parameter "$_"} );
1593         }
1594     }
1595
1596     for ($args{x}, $args{y}) {
1597         $_ = 1 unless defined;
1598     }
1599
1600     my $form = $self->{form};
1601     my $request;
1602     if ( $args{name} ) {
1603         $request = $form->click( $args{name}, $args{x}, $args{y} );
1604     }
1605     elsif ( $args{number} ) {
1606         my $input = $form->find_input( undef, 'submit', $args{number} );
1607         $request = $input->click( $form, $args{x}, $args{y} );
1608     }
1609     elsif ( $args{input} ) {
1610         $request = $args{input}->click( $form, $args{x}, $args{y} );
1611     }
1612     elsif ( $args{value} ) {
1613         my $i = 1;
1614         while ( my $input = $form->find_input(undef, 'submit', $i) ) {
1615             if ( $args{value} && ($args{value} eq $input->value) ) {
1616                 $request = $input->click( $form, $args{x}, $args{y} );
1617                 last;
1618             }
1619             $i++;
1620         } # while
1621     } # $args{value}
1622
1623     return $self->request( $request );
1624 }
1625
1626 =head2 $mech->submit()
1627
1628 Submits the page, without specifying a button to click.  Actually,
1629 no button is clicked at all.
1630
1631 Returns an L<HTTP::Response> object.
1632
1633 This used to be a synonym for C<< $mech->click( 'submit' ) >>, but is no
1634 longer so.
1635
1636 =cut
1637
1638 sub submit {
1639     my $self = shift;
1640
1641     my $request = $self->{form}->make_request;
1642     return $self->request( $request );
1643 }
1644
1645 =head2 $mech->submit_form( ... )
1646
1647 This method lets you select a form from the previously fetched page,
1648 fill in its fields, and submit it. It combines the form_number/form_name,
1649 set_fields and click methods into one higher level call. Its arguments
1650 are a list of key/value pairs, all of which are optional.
1651
1652 =over 4
1653
1654 =item * fields => \%fields
1655
1656 Specifies the fields to be filled in the current form.
1657
1658 =item * with_fields => \%fields
1659
1660 Probably all you need for the common case. It combines a smart form selector
1661 and data setting in one operation. It selects the first form that contains all
1662 fields mentioned in C<\%fields>.  This is nice because you don't need to know
1663 the name or number of the form to do this.
1664
1665 (calls C<L<form_with_fields>> and C<L<set_fields()>>).
1666
1667 If you choose this, the form_number, form_name and fields options will be ignored.
1668
1669 =item * form_number => n
1670
1671 Selects the I<n>th form (calls C<L<form_number()>>).  If this parm is not
1672 specified, the currently-selected form is used.
1673
1674 =item * form_name => name
1675
1676 Selects the form named I<name> (calls C<L<form_name()>>)
1677
1678 =item * button => button
1679
1680 Clicks on button I<button> (calls C<L<click()>>)
1681
1682 =item * x => x, y => y
1683
1684 Sets the x or y values for C<L<click()>>
1685
1686 =back
1687
1688 If no form is selected, the first form found is used.
1689
1690 If I<button> is not passed, then the C<L<submit()>> method is used instead.
1691
1692 Returns an L<HTTP::Response> object.
1693
1694 =cut
1695
1696 sub submit_form {
1697     my( $self, %args ) = @_ ;
1698
1699     for ( keys %args ) {
1700         if ( !/^(form_(number|name|fields)|(with_)?fields|button|x|y)$/ ) {
1701             # XXX Why not die here?
1702             $self->warn( qq{Unknown submit_form parameter "$_"} );
1703         }
1704     }
1705
1706     my $fields;
1707     for (qw/with_fields fields/) {
1708         if ($args{$_}) {
1709             if ( ref $args{$_} eq 'HASH' ) {
1710                 $fields = $args{$_};
1711             }
1712             else {
1713                 die "$_ arg to submit_form must be a hashref";
1714             }
1715             last;
1716         }
1717     }
1718
1719     if ($args{'with_fields'}) {
1720         $fields || die q{must submit some 'fields' with with_fields};
1721         $self->form_with_fields(keys %{$fields}) or die;
1722     }
1723     elsif ( my $form_number = $args{'form_number'} ) {
1724         $self->form_number( $form_number ) or die;
1725     }
1726     elsif ( my $form_name = $args{'form_name'} ) {
1727         $self->form_name( $form_name ) or die;
1728     }
1729     else {
1730         # No form selector was used.
1731         # Maybe a form was set separately, or we'll default to the first form.
1732     }
1733
1734     $self->set_fields( %{$fields} ) if $fields;
1735
1736     my $response;
1737     if ( $args{button} ) {
1738         $response = $self->click( $args{button}, $args{x} || 0, $args{y} || 0 );
1739     }
1740     else {
1741         $response = $self->submit();
1742     }
1743
1744     return $response;
1745 }
1746
1747 =head1 MISCELLANEOUS METHODS
1748
1749 =head2 $mech->add_header( name => $value [, name => $value... ] )
1750
1751 Sets HTTP headers for the agent to add or remove from the HTTP request.
1752
1753     $mech->add_header( Encoding => 'text/klingon' );
1754
1755 If a I<value> is C<undef>, then that header will be removed from any
1756 future requests.  For example, to never send a Referer header:
1757
1758     $mech->add_header( Referer => undef );
1759
1760 If you want to delete a header, use C<delete_header>.
1761
1762 Returns the number of name/value pairs added.
1763
1764 B<NOTE>: This method was very different in WWW::Mechanize before 1.00.
1765 Back then, the headers were stored in a package hash, not as a member of
1766 the object instance.  Calling C<add_header()> would modify the headers
1767 for every WWW::Mechanize object, even after your object no longer existed.
1768
1769 =cut
1770
1771 sub add_header {
1772     my $self = shift;
1773     my $npairs = 0;
1774
1775     while ( @_ ) {
1776         my $key = shift;
1777         my $value = shift;
1778         ++$npairs;
1779
1780         $self->{headers}{$key} = $value;
1781     }
1782
1783     return $npairs;
1784 }
1785
1786 =head2 $mech->delete_header( name [, name ... ] )
1787
1788 Removes HTTP headers from the agent's list of special headers.  For
1789 instance, you might need to do something like:
1790
1791     # Don't send a Referer for this URL
1792     $mech->add_header( Referer => undef );
1793
1794     # Get the URL
1795     $mech->get( $url );
1796
1797     # Back to the default behavior
1798     $mech->delete_header( 'Referer' );
1799
1800 =cut
1801
1802 sub delete_header {
1803     my $self = shift;
1804
1805     while ( @_ ) {
1806         my $key = shift;
1807
1808         delete $self->{headers}{$key};
1809     }
1810
1811     return;
1812 }
1813
1814
1815 =head2 $mech->quiet(true/false)
1816
1817 Allows you to suppress warnings to the screen.
1818
1819     $mech->quiet(0); # turns on warnings (the default)
1820     $mech->quiet(1); # turns off warnings
1821     $mech->quiet();  # returns the current quietness status
1822
1823 =cut
1824
1825 sub quiet {
1826     my $self = shift;
1827
1828     $self->{quiet} = $_[0] if @_;
1829
1830     return $self->{quiet};
1831 }
1832
1833 =head2 $mech->stack_depth( $max_depth )
1834
1835 Get or set the page stack depth. Use this if you're doing a lot of page
1836 scraping and running out of memory.
1837
1838 A value of 0 means "no history at all."  By default, the max stack depth
1839 is humongously large, effectively keeping all history.
1840
1841 =cut
1842
1843 sub stack_depth {
1844     my $self = shift;
1845     $self->{stack_depth} = shift if @_;
1846     return $self->{stack_depth};
1847 }
1848
1849 =head2 $mech->save_content( $filename )
1850
1851 Dumps the contents of C<< $mech->content >> into I<$filename>.
1852 I<$filename> will be overwritten.  Dies if there are any errors.
1853
1854 =cut
1855
1856 sub save_content {
1857     my $self = shift;
1858     my $filename = shift;
1859
1860     open( my $fh, '>', $filename ) or $self->die( "Unable to create $filename: $!" );
1861     print {$fh} $self->content or $self->die( "Unable to write to $filename: $!" );
1862     close $fh or $self->die( "Unable to close $filename: $!" );
1863
1864     return;
1865 }
1866
1867 =head2 $mech->dump_links( [[$fh], $absolute] )
1868
1869 Prints a dump of the links on the current page to I<$fh>.  If I<$fh>
1870 is not specified or is undef, it dumps to STDOUT.
1871
1872 If I<$absolute> is true, links displayed are absolute, not relative.
1873
1874 =cut
1875
1876 sub dump_links {
1877     my $self = shift;
1878     my $fh = shift || \*STDOUT;
1879     my $absolute = shift;
1880
1881     for my $link ( $self->links ) {
1882         my $url = $absolute ? $link->url_abs : $link->url;
1883         $url = '' if not defined $url;
1884         print {$fh} $url, "\n";
1885     }
1886     return;
1887 }
1888
1889 =head2 $mech->dump_images( [[$fh], $absolute] )
1890
1891 Prints a dump of the images on the current page to I<$fh>.  If I<$fh>
1892 is not specified or is undef, it dumps to STDOUT.
1893
1894 If I<$absolute> is true, links displayed are absolute, not relative.
1895
1896 =cut
1897
1898 sub dump_images {
1899     my $self = shift;
1900     my $fh = shift || \*STDOUT;
1901     my $absolute = shift;
1902
1903     for my $image ( $self->images ) {
1904         my $url = $absolute ? $image->url_abs : $image->url;
1905         $url = '' if not defined $url;
1906         print {$fh} $url, "\n";
1907     }
1908     return;
1909 }
1910
1911 =head2 $mech->dump_forms( [$fh] )
1912
1913 Prints a dump of the forms on the current page to I<$fh>.  If I<$fh>
1914 is not specified or is undef, it dumps to STDOUT.
1915
1916 =cut
1917
1918 sub dump_forms {
1919     my $self = shift;
1920     my $fh = shift || \*STDOUT;
1921
1922     for my $form ( $self->forms ) {
1923         print {$fh} $form->dump, "\n";
1924     }
1925     return;
1926 }
1927
1928 =head2 $mech->dump_all( [[$fh], $absolute] )
1929
1930 Prints a dump of all links, images and forms on the current page to
1931 I<$fh>.  If I<$fh> is not specified or is undef, it dumps to STDOUT.
1932
1933 If I<$absolute> is true, links displayed are absolute, not relative.
1934
1935 =cut
1936
1937 sub dump_all {
1938     my $self = shift;
1939     my $fh = shift || \*STDOUT;
1940     my $absolute = shift;
1941
1942     $self->dump_links( $fh, $absolute );
1943     $self->dump_images( $fh, $absolute );
1944     $self->dump_forms( $fh, $absolute );
1945
1946     return;
1947 }
1948
1949
1950 =head1 OVERRIDDEN LWP::UserAgent METHODS
1951
1952 =head2 $mech->clone()
1953
1954 Clone the mech object. We override here to be sure the cookie jar
1955 gets copied over
1956
1957 =cut
1958
1959 sub clone {
1960     my $self = shift;
1961     my $clone =  $self->SUPER::clone();
1962     $clone->{cookie_jar} = $self->cookie_jar;
1963     return $clone;
1964 }
1965
1966
1967 =head2 $mech->redirect_ok()
1968
1969 An overloaded version of C<redirect_ok()> in L<LWP::UserAgent>.
1970 This method is used to determine whether a redirection in the request
1971 should be followed.
1972
1973 =cut
1974
1975 sub redirect_ok {
1976     my $self = shift;
1977     my $prospective_request = shift;
1978     my $response = shift;
1979
1980     my $ok = $self->SUPER::redirect_ok( $prospective_request, $response );
1981     if ( $ok ) {
1982         $self->{redirected_uri} = $prospective_request->uri;
1983     }
1984
1985     return $ok;
1986 }
1987
1988
1989 =head2 $mech->request( $request [, $arg [, $size]])
1990
1991 Overloaded version of C<request()> in L<LWP::UserAgent>.  Performs
1992 the actual request.  Normally, if you're using WWW::Mechanize, it's
1993 because you don't want to deal with this level of stuff anyway.
1994
1995 Note that C<$request> will be modified.
1996
1997 Returns an L<HTTP::Response> object.
1998
1999 =cut
2000
2001 sub request {
2002     my $self = shift;
2003     my $request = shift;
2004
2005     $request = $self->_modify_request( $request );
2006
2007     if ( $request->method eq 'GET' || $request->method eq 'POST' ) {
2008         $self->_push_page_stack();
2009     }
2010
2011     $self->_update_page($request, $self->_make_request( $request, @_ ));
2012
2013     # XXX This should definitively return something.
2014 }
2015
2016 =head2 $mech->update_html( $html )
2017
2018 Allows you to replace the HTML that the mech has found.  Updates the
2019 forms and links parse-trees that the mech uses internally.
2020
2021 Say you have a page that you know has malformed output, and you want to
2022 update it so the links come out correctly:
2023
2024     my $html = $mech->content;
2025     $html =~ s[</option>.{0,3}</td>][</option></select></td>]isg;
2026     $mech->update_html( $html );
2027
2028 This method is also used internally by the mech itself to update its
2029 own HTML content when loading a page. This means that if you would
2030 like to I<systematically> perform the above HTML substitution, you
2031 would overload I<update_html> in a subclass thusly:
2032
2033    package MyMech;
2034    use base 'WWW::Mechanize';
2035
2036    sub update_html {
2037        my ($self, $html) = @_;
2038        $html =~ s[</option>.{0,3}</td>][</option></select></td>]isg;
2039        $self->WWW::Mechanize::update_html( $html );
2040    }
2041
2042 If you do this, then the mech will use the tidied-up HTML instead of
2043 the original both when parsing for its own needs, and for returning to
2044 you through L</content>.
2045
2046 Overloading this method is also the recommended way of implementing
2047 extra validation steps (e.g. link checkers) for every HTML page
2048 received.  L</warn> and L</die> would then come in handy to signal
2049 validation errors.
2050
2051 =cut
2052
2053 sub update_html {
2054     my $self = shift;
2055     my $html = shift;
2056
2057     $self->_reset_page;
2058     $self->{ct} = 'text/html';
2059     $self->{content} = $html;
2060
2061     $self->{forms} = [ HTML::Form->parse($html, $self->base) ];
2062     for my $form (@{ $self->{forms} }) {
2063         for my $input ($form->inputs) {
2064              if ($input->type eq 'file') {
2065                  $input->value( undef );
2066              }
2067         }
2068     }
2069     $self->{form}  = $self->{forms}->[0];
2070     $self->{_extracted_links} = 0;
2071     $self->{_extracted_images} = 0;
2072
2073     return;
2074 }
2075
2076 =head2 $mech->credentials( $username, $password )
2077
2078 Provide credentials to be used for HTTP Basic authentication for all sites and
2079 realms until further notice.
2080
2081 The four argument form described in L<LWP::UserAgent> is still supported.
2082
2083 =cut
2084
2085 {
2086     my $saved_method;
2087
2088     sub credentials {
2089         my $self = shift;
2090         no warnings 'redefine'; ## no critic
2091
2092         if (@_ == 4) {
2093             $saved_method
2094                 and *LWP::UserAgent::get_basic_credentials = $saved_method;
2095             return $self->SUPER::credentials(@_);
2096         }
2097
2098         @_ == 2
2099             or $self->die( 'Invalid # of args for overridden credentials()' );
2100
2101         my ($username, $password) = @_;
2102         $saved_method ||= \&LWP::UserAgent::get_basic_credentials;
2103         *LWP::UserAgent::get_basic_credentials
2104             = sub { return $username, $password };
2105     }
2106 }
2107
2108
2109 =head1 INTERNAL-ONLY METHODS
2110
2111 These methods are only used internally.  You probably don't need to
2112 know about them.
2113
2114 =head2 $mech->_update_page($request, $response)
2115
2116 Updates all internal variables in $mech as if $request was just
2117 performed, and returns $response. The page stack is B<not> altered by
2118 this method, it is up to caller (e.g. L</request>) to do that.
2119
2120 =cut
2121
2122 sub _update_page {
2123     my ($self, $request, $res) = @_;
2124
2125     $self->{req} = $request;
2126     $self->{redirected_uri} = $request->uri->as_string;
2127
2128     $self->{res} = $res;
2129
2130     $self->{status}  = $res->code;
2131     $self->{base}    = $res->base;
2132     $self->{ct}      = $res->content_type || '';
2133
2134     if ( $res->is_success ) {
2135         $self->{uri} = $self->{redirected_uri};
2136         $self->{last_uri} = $self->{uri};
2137     }
2138
2139     if ( $res->is_error ) {
2140         if ( $self->{autocheck} ) {
2141             $self->die( 'Error ', $request->method, 'ing ', $request->uri, ': ', $res->message );
2142         }
2143     }
2144
2145     $self->_reset_page;
2146
2147     # Try to decode the content. Undef will be returned if there's nothing to decompress.
2148     # See docs in HTTP::Message for details. Do we need to expose the options there?
2149     # use charset => 'none' because while we want LWP to handle Content-Encoding for
2150     # the auto-gzipping with Compress::Zlib we don't want it messing with charset
2151     my $content = $res->decoded_content( charset => 'none' );
2152     $content = $res->content if (not defined $content);
2153
2154     $content .= _taintedness();
2155
2156     if ($self->is_html) {
2157         $self->update_html($content);
2158     }
2159     else {
2160         $self->{content} = $content;
2161     }
2162
2163     return $res;
2164 } # _update_page
2165
2166 our $_taintbrush;
2167
2168 # This is lifted wholesale from Test::Taint
2169 sub _taintedness {
2170     return $_taintbrush if defined $_taintbrush;
2171
2172     # Somehow we need to get some taintedness into our $_taintbrush.
2173     # Let's try the easy way first. Either of these should be
2174     # tainted, unless somebody has untainted them, so this
2175     # will almost always work on the first try.
2176     # (Unless, of course, taint checking has been turned off!)
2177     $_taintbrush = substr("$0$^X", 0, 0);
2178     return $_taintbrush if _is_tainted( $_taintbrush );
2179
2180     # Let's try again. Maybe somebody cleaned those.
2181     $_taintbrush = substr(join("", @ARGV, %ENV), 0, 0);
2182     return $_taintbrush if _is_tainted( $_taintbrush );
2183
2184     # If those don't work, go try to open some file from some unsafe
2185     # source and get data from them.  That data is tainted.
2186     # (Yes, even reading from /dev/null works!)
2187     for my $filename ( qw(/dev/null / . ..), values %INC, $0, $^X ) {
2188         if ( open my $fh, '<', $filename ) {
2189             my $data;
2190             if ( defined sysread $fh, $data, 1 ) {
2191                 $_taintbrush = substr( $data, 0, 0 );
2192                 last if _is_tainted( $_taintbrush );
2193             }
2194         }
2195     }
2196
2197     # Sanity check
2198     die "Our taintbrush should have zero length!" if length $_taintbrush;
2199
2200     return $_taintbrush;
2201 }
2202
2203 sub _is_tainted {
2204     no warnings qw(void uninitialized);
2205
2206     return !eval { join('', shift), kill 0; 1 };
2207 } # _is_tainted
2208
2209
2210 =head2 $mech->_modify_request( $req )
2211
2212 Modifies a L<HTTP::Request> before the request is sent out,
2213 for both GET and POST requests.
2214
2215 We add a C<Referer> header, as well as header to note that we can accept gzip
2216 encoded content, if L<Compress::Zlib> is installed.
2217
2218 =cut
2219
2220 sub _modify_request {
2221     my $self = shift;
2222     my $req = shift;
2223
2224     # add correct Accept-Encoding header to restore compliance with
2225     # http://www.freesoft.org/CIE/RFC/2068/158.htm
2226     # http://use.perl.org/~rhesa/journal/25952
2227     if (not $req->header( 'Accept-Encoding' ) ) {
2228         # "identity" means "please! unencoded content only!"
2229         $req->header( 'Accept-Encoding', $HAS_ZLIB ? 'gzip' : 'identity' );
2230     }
2231
2232     my $last = $self->{last_uri};
2233     if ( $last ) {
2234         $last = $last->as_string if ref($last);
2235         $req->header( Referer => $last );
2236     }
2237     while ( my($key,$value) = each %{$self->{headers}} ) {
2238         if ( defined $value ) {
2239             $req->header( $key => $value );
2240         }
2241         else {
2242             $req->remove_header( $key );
2243         }
2244     }
2245
2246     return $req;
2247 }
2248
2249
2250 =head2 $mech->_make_request()
2251
2252 Convenience method to make it easier for subclasses like
2253 L<WWW::Mechanize::Cached> to intercept the request.
2254
2255 =cut
2256
2257 sub _make_request {
2258     my $self = shift;
2259     $self->SUPER::request(@_);
2260 }
2261
2262 =head2 $mech->_reset_page()
2263
2264 Resets the internal fields that track page parsed stuff.
2265
2266 =cut
2267
2268 sub _reset_page {
2269     my $self = shift;
2270
2271     $self->{_extracted_links} = 0;
2272     $self->{_extracted_images} = 0;
2273     $self->{links} = [];
2274     $self->{images} = [];
2275     $self->{forms} = [];
2276     delete $self->{form};
2277
2278     return;
2279 }
2280
2281 =head2 $mech->_extract_links()
2282
2283 Extracts links from the content of a webpage, and populates the C<{links}>
2284 property with L<WWW::Mechanize::Link> objects.
2285
2286 =cut
2287
2288 my %link_tags = (
2289     a => 'href',
2290     area => 'href',
2291     frame => 'src',
2292     iframe => 'src',
2293     meta => 'content',
2294 );
2295
2296 sub _extract_links {
2297     my $self = shift;
2298
2299
2300     $self->{links} = [];
2301     if ( defined $self->{content} ) {
2302         my $parser = HTML::TokeParser->new(\$self->{content});
2303         while ( my $token = $parser->get_tag( keys %link_tags ) ) {
2304             my $link = $self->_link_from_token( $token, $parser );
2305             push( @{$self->{links}}, $link ) if $link;
2306         } # while
2307     }
2308
2309     $self->{_extracted_links} = 1;
2310
2311     return;
2312 }
2313
2314
2315 my %image_tags = (
2316     img => 'src',
2317     input => 'src',
2318 );
2319
2320 sub _extract_images {
2321     my $self = shift;
2322
2323     $self->{images} = [];
2324
2325     if ( defined $self->{content} ) {
2326         my $parser = HTML::TokeParser->new(\$self->{content});
2327         while ( my $token = $parser->get_tag( keys %image_tags ) ) {
2328             my $image = $self->_image_from_token( $token, $parser );
2329             push( @{$self->{images}}, $image ) if $image;
2330         } # while
2331     }
2332
2333     $self->{_extracted_images} = 1;
2334
2335     return;
2336 }
2337
2338 sub _image_from_token {
2339     my $self = shift;
2340     my $token = shift;
2341     my $parser = shift;
2342
2343     my $tag = $token->[0];
2344     my $attrs = $token->[1];
2345
2346     if ( $tag eq 'input' ) {
2347         my $type = $attrs->{type} or return;
2348         return unless $type eq 'image';
2349     }
2350
2351     require WWW::Mechanize::Image;
2352     return
2353         WWW::Mechanize::Image->new({
2354             tag     => $tag,
2355             base    => $self->base,
2356             url     => $attrs->{src},
2357             name    => $attrs->{name},
2358             height  => $attrs->{height},
2359             width   => $attrs->{width},
2360             alt     => $attrs->{alt},
2361         });
2362 }
2363
2364 sub _link_from_token {
2365     my $self = shift;
2366     my $token = shift;
2367     my $parser = shift;
2368
2369     my $tag = $token->[0];
2370     my $attrs = $token->[1];
2371     my $url = $attrs->{$link_tags{$tag}};
2372
2373     my $text;
2374     my $name;
2375     if ( $tag eq 'a' ) {
2376         $text = $parser->get_trimmed_text("/$tag");
2377         $text = '' unless defined $text;
2378
2379         my $onClick = $attrs->{onclick};
2380         if ( $onClick && ($onClick =~ /^window\.open\(\s*'([^']+)'/) ) {
2381             $url = $1;
2382         }
2383     } # a
2384
2385     # Of the tags we extract from, only 'AREA' has an alt tag
2386     # The rest should have a 'name' attribute.
2387     # ... but we don't do anything with that bit of wisdom now.
2388
2389     $name = $attrs->{name};
2390
2391     if ( $tag eq 'meta' ) {
2392         my $equiv = $attrs->{'http-equiv'};
2393         my $content = $attrs->{'content'};
2394         return unless $equiv && (lc $equiv eq 'refresh') && defined $content;
2395
2396         if ( $content =~ /^\d+\s*;\s*url\s*=\s*(\S+)/i ) {
2397             $url = $1;
2398             $url =~ s/^"(.+)"$/$1/ or $url =~ s/^'(.+)'$/$1/;
2399         }
2400         else {
2401             undef $url;
2402         }
2403     } # meta
2404
2405     return unless defined $url;   # probably just a name link or <AREA NOHREF...>
2406
2407     require WWW::Mechanize::Link;
2408     return
2409         WWW::Mechanize::Link->new({
2410             url  => $url,
2411             text => $text,
2412             name => $name,
2413             tag  => $tag,
2414             base => $self->base,
2415             attrs => $attrs,
2416         });
2417 } # _link_from_token
2418
2419 =head2 $mech->_push_page_stack() / $mech->_pop_page_stack()
2420
2421 The agent keeps a stack of visited pages, which it can pop when it needs
2422 to go BACK and so on.
2423
2424 The current page needs to be pushed onto the stack before we get a new
2425 page, and the stack needs to be popped when BACK occurs.
2426
2427 Neither of these take any arguments, they just operate on the $mech
2428 object.
2429
2430 =cut
2431
2432 sub _push_page_stack {
2433     my $self = shift;
2434
2435     # Don't push anything if it's a virgin object
2436     if ( $self->{res} && $self->stack_depth ) {
2437         my $save_stack = $self->{page_stack};
2438         $self->{page_stack} = [];
2439
2440         my $clone = $self->clone;
2441         push( @{$save_stack}, $clone );
2442
2443         while ( @{$save_stack} > $self->stack_depth ) {
2444             shift @{$save_stack};
2445         }
2446         $self->{page_stack} = $save_stack;
2447     }
2448
2449     return 1;
2450 }
2451
2452 sub _pop_page_stack {
2453     my $self = shift;
2454
2455     if ( $self->{page_stack} && @{$self->{page_stack}} ) {
2456         my $popped = pop @{$self->{page_stack}};
2457
2458         # eliminate everything in self
2459         foreach my $key ( keys %{$self} ) {
2460             delete $self->{ $key }              unless $key eq 'page_stack';
2461         }
2462
2463         # make self just like the popped object
2464         foreach my $key ( keys %{$popped} ) {
2465             $self->{ $key } = $popped->{ $key } unless $key eq 'page_stack';
2466         }
2467     }
2468
2469     return 1;
2470 }
2471
2472 =head2 warn( @messages )
2473
2474 Centralized warning method, for diagnostics and non-fatal problems.
2475 Defaults to calling C<CORE::warn>, but may be overridden by setting
2476 C<onwarn> in the constructor.
2477
2478 =cut
2479
2480 sub warn {
2481     my $self = shift;
2482
2483     return unless my $handler = $self->{onwarn};
2484
2485     return if $self->quiet;
2486
2487     return $handler->(@_);
2488 }
2489
2490 =head2 die( @messages )
2491
2492 Centralized error method.  Defaults to calling C<CORE::die>, but
2493 may be overridden by setting C<onerror> in the constructor.
2494
2495 =cut
2496
2497 sub die {
2498     my $self = shift;
2499
2500     return unless my $handler = $self->{onerror};
2501
2502     return $handler->(@_);
2503 }
2504
2505
2506 # NOT an object method!
2507 sub _warn {
2508     require Carp;
2509     return &Carp::carp; ## no critic
2510 }
2511
2512 # NOT an object method!
2513 sub _die {
2514     require Carp;
2515     return &Carp::croak; ## no critic
2516 }
2517
2518 1; # End of module
2519
2520 __END__
2521
2522 =head1 REQUESTS & BUGS
2523
2524 The bug queue for WWW::Mechanize and Test::WWW::Mechanize is at
2525 L<http://code.google.com/p/www-mechanize/issues/list>.  Please do
2526 not add any tickets to the old queue at L<http://rt.cpan.org/>.
2527
2528 =head1 WWW::MECHANIZE'S SUBVERSION REPOSITORY
2529
2530 Mech and Test::WWW::Mechanize are both hosted at Google Code:
2531 http://code.google.com/p/www-mechanize/.  The Subversion repository
2532 is at http://www-mechanize.googlecode.com/svn/wm/.
2533
2534 =head1 OTHER DOCUMENTATION
2535
2536 =head2 I<Spidering Hacks>, by Kevin Hemenway and Tara Calishain
2537
2538 I<Spidering Hacks> from O'Reilly
2539 (L<http://www.oreilly.com/catalog/spiderhks/>) is a great book for anyone
2540 wanting to know more about screen-scraping and spidering.
2541
2542 There are six hacks that use Mech or a Mech derivative:
2543
2544 =over 4
2545
2546 =item #21 WWW::Mechanize 101
2547
2548 =item #22 Scraping with WWW::Mechanize
2549
2550 =item #36 Downloading Images from Webshots
2551
2552 =item #44 Archiving Yahoo! Groups Messages with WWW::Yahoo::Groups
2553
2554 =item #64 Super Author Searching
2555
2556 =item #73 Scraping TV Listings
2557
2558 =back
2559
2560 The book was also positively reviewed on Slashdot:
2561 L<http://books.slashdot.org/article.pl?sid=03/12/11/2126256>
2562
2563 =head1 ONLINE RESOURCES AND SUPPORT
2564
2565 =over 4
2566
2567 =item * WWW::Mechanize mailing list
2568
2569 The Mech mailing list is at
2570 L<http://groups.google.com/group/www-mechanize-users> and is specific
2571 to Mechanize, unlike the LWP mailing list below.  Although it is a
2572 users list, all development discussion takes place here, too.
2573
2574 =item * LWP mailing list
2575
2576 The LWP mailing list is at
2577 L<http://lists.perl.org/showlist.cgi?name=libwww>, and is more
2578 user-oriented and well-populated than the WWW::Mechanize list.
2579
2580 =item * Perlmonks
2581
2582 L<http://perlmonks.org> is an excellent community of support, and
2583 many questions about Mech have already been answered there.
2584
2585 =item * L<WWW::Mechanize::Examples>
2586
2587 A random array of examples submitted by users, included with the
2588 Mechanize distribution.
2589
2590 =back
2591
2592 =head1 ARTICLES ABOUT WWW::MECHANIZE
2593
2594 =over 4
2595
2596 =item * L<http://www-128.ibm.com/developerworks/linux/library/wa-perlsecure.html>
2597
2598 IBM article "Secure Web site access with Perl"
2599
2600 =item * L<http://www.oreilly.com/catalog/googlehks2/chapter/hack84.pdf>
2601
2602 Leland Johnson's hack #84 in I<Google Hacks, 2nd Edition> is
2603 an example of a production script that uses WWW::Mechanize and
2604 HTML::TableContentParser. It takes in keywords and returns the estimated
2605 price of these keywords on Google's AdWords program.
2606
2607 =item * L<http://www.perl.com/pub/a/2004/06/04/recorder.html>
2608
2609 Linda Julien writes about using HTTP::Recorder to create WWW::Mechanize
2610 scripts.
2611
2612 =item * L<http://www.developer.com/lang/other/article.php/3454041>
2613
2614 Jason Gilmore's article on using WWW::Mechanize for scraping sales
2615 information from Amazon and eBay.
2616
2617 =item * L<http://www.perl.com/pub/a/2003/01/22/mechanize.html>
2618
2619 Chris Ball's article about using WWW::Mechanize for scraping TV
2620 listings.
2621
2622 =item * L<http://www.stonehenge.com/merlyn/LinuxMag/col47.html>
2623
2624 Randal Schwartz's article on scraping Yahoo News for images.  It's
2625 already out of date: He manually walks the list of links hunting
2626 for matches, which wouldn't have been necessary if the C<find_link()>
2627 method existed at press time.
2628
2629 =item * L<http://www.perladvent.org/2002/16th/>
2630
2631 WWW::Mechanize on the Perl Advent Calendar, by Mark Fowler.
2632
2633 =item * L<http://www.linux-magazin.de/Artikel/ausgabe/2004/03/perl/perl.html>
2634
2635 Michael Schilli's article on Mech and L<WWW::Mechanize::Shell> for the
2636 German magazine I<Linux Magazin>.
2637
2638 =back
2639
2640 =head2 Other modules that use Mechanize
2641
2642 Here are modules that use or subclass Mechanize.  Let me know of any others:
2643
2644 =over 4
2645
2646 =item * L<Finance::Bank::LloydsTSB>
2647
2648 =item * L<HTTP::Recorder>
2649
2650 Acts as a proxy for web interaction, and then generates WWW::Mechanize scripts.
2651
2652 =item * L<Win32::IE::Mechanize>
2653
2654 Just like Mech, but using Microsoft Internet Explorer to do the work.
2655
2656 =item * L<WWW::Bugzilla>
2657
2658 =item * L<WWW::CheckSite>
2659
2660 =item * L<WWW::Google::Groups>
2661
2662 =item * L<WWW::Hotmail>
2663
2664 =item * L<WWW::Mechanize::Cached>
2665
2666 =item * L<WWW::Mechanize::FormFiller>
2667
2668 =item * L<WWW::Mechanize::Shell>
2669
2670 =item * L<WWW::Mechanize::Sleepy>
2671
2672 =item * L<WWW::Mechanize::SpamCop>
2673
2674 =item * L<WWW::Mechanize::Timed>
2675
2676 =item * L<WWW::SourceForge>
2677
2678 =item * L<WWW::Yahoo::Groups>
2679
2680 =back
2681
2682 =head1 ACKNOWLEDGEMENTS
2683
2684 Thanks to the numerous people who have helped out on WWW::Mechanize in
2685 one way or another, including
2686 Kirrily Robert for the original C<WWW::Automate>,
2687 Adriano Ferreira,
2688 Miyagawa,
2689 Peteris Krumins,
2690 Rafael Kitover,
2691 David Steinbrunner,
2692 Kevin Falcone,
2693 Mike O'Regan,
2694 Mark Stosberg,
2695 Uri Guttman,
2696 Peter Scott,
2697 Phillipe Bruhat,
2698 Ian Langworth,
2699 John Beppu,
2700 Gavin Estey,
2701 Jim Brandt,
2702 Ask Bjoern Hansen,
2703 Greg Davies,
2704 Ed Silva,
2705 Mark-Jason Dominus,
2706 Autrijus Tang,
2707 Mark Fowler,
2708 Stuart Children,
2709 Max Maischein,
2710 Meng Wong,
2711 Prakash Kailasa,
2712 Abigail,
2713 Jan Pazdziora,
2714 Dominique Quatravaux,
2715 Scott Lanning,
2716 Rob Casey,
2717 Leland Johnson,
2718 Joshua Gatcomb,
2719 Julien Beasley,
2720 Abe Timmerman,
2721 Peter Stevens,
2722 Pete Krawczyk,
2723 Tad McClellan,
2724 and the late great Iain Truskett.
2725
2726 =head1 COPYRIGHT
2727
2728 Copyright (c) 2005-2007 Andy Lester. All rights reserved. This program is
2729 free software; you can redistribute it and/or modify it under the same
2730 terms as Perl itself.
2731
2732 =cut