use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
use HTML::LinkExtractor;
use HTML::Entities;
use Data::Dumper;
use Encode;
use utf8;
use strict;
use warnings;
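#
# Walk the SEDAR issuer directory pages, keep the active TSX-listed
# companies, and write a pipe-delimited listing plus a wikitable.
#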
# Directory search variables
my $dirURL = "https://www.sedar.com/issuers/company_issuers_%char%_en.htm";
my $replaceme = '%char%';
my @alpha = ("a" .. "z");
push(@alpha, "nc"); # add NC for non-alphabetic page
# Company listing variables
my $comURL = "https://www.sedar.com/DisplayProfile.do?lang=%lang%&issuerType=03&issuerNo=%issuerNo%";
my $issuerNoSub = '%issuerNo%';
my $langSub = '%lang%';
my @langArray = ("EN", "FR");
# Misc variables
my %reftable;
# Output files
open ( CVSFILE, ">sedar_pipe.txt" ) || die ("you die now!");
open ( WIKITXT, ">wiki.txt" ) || die ("you die now!");
print CVSFILE "symbol|name_en|name_fr|industry|website|address|formed\n";
print WIKITXT "{| class=\"wikitable sortable\"\n!Symbole !! Nom_en !! Nom_fr !! Lien\n|-\n";
# Cycle through all the directory pages
foreach my $alpha (@alpha) {
# Create the URL of the directory page
(my $currenturl = $dirURL) =~s/$replaceme/$alpha/g;
# Grab the directory page
print "Grabbing page: ". $currenturl . "\n";
my $contents = &getPage($currenturl);
#print "$contents\n";
# Extract all links
my $page_parser = HTML::LinkExtractor->new(undef, undef, 1);
$page_parser->parse(\$contents);
# Add matching company links to hash table
for my $link( @{ $page_parser->links } ) {
#print "Found link $$link{href}\n";
if (defined $$link{href} && $$link{href} =~ m/DisplayProfile\.do/) {
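# Capture the 8-digit issuer number from the profile link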
my ($issuerNo) = ($$link{href} =~ m/issuerNo=(\d{8})/);
next unless defined $issuerNo;
$reftable{$issuerNo}{'name_en'} = encode('utf8', $$link{_TEXT});
}
}
}
print "Check company page...\n";
foreach my $key (sort {"\U$reftable{$a}->{'name_en'}" cmp "\U$reftable{$b}->{'name_en'}" } keys %reftable) {
# Build the english URL
(my $currenturl = $comURL) =~s/$langSub/EN/g;
$currenturl =~s/$issuerNoSub/$key/g;
$reftable{$key}{'link_en'} = $currenturl;
# Grab the page
my $contents = &getPage($currenturl);
# Check if the company is active
if (&ParseProfile($contents, "Reporting Jurisdictions") eq "Ceased Reporting") {
delete $reftable{$key};
next;
}
# Check if the company is on the TSX
my $exchange = &ParseProfile($contents, "Stock Exchange");
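# The exchange field may list several markets; keep the issuer only when TSX appears at the end or before a comma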
if ($exchange !~ m/TSX($|,)/) {
delete $reftable{$key};
next;
}
# Grab the rest of the data
$reftable{$key}{'symbol'} = &ParseProfile($contents, "Stock Symbol");
$reftable{$key}{'address'} = &ParseProfile($contents, "Head Office Address");
$reftable{$key}{'formed'} = &ParseProfile($contents, "Date of Formation");
$reftable{$key}{'industry'} = &ParseProfile($contents, "Industry Classification");
# Symbol adjust: insert a "." after the first three characters (e.g. "ABCA" becomes "ABC.A")
if (length ($reftable{$key}{'symbol'}) > 3) {
$reftable{$key}{'symbol'} = substr($reftable{$key}{'symbol'}, 0, 3) .
"." . substr($reftable{$key}{'symbol'}, 3);
}
# Guess the website from the business e-mail address
my $email = &ParseProfile($contents, "Business e-mail address");
if ($email =~ m/@/) {
$email =~ m/@(.*?)$/gism;
$reftable{$key}{'website'} = "http://www." . $1;
} else {
$reftable{$key}{'website'} = "";
}
# Build the french URL
($currenturl = $comURL) =~s/$langSub/FR/g;
$currenturl =~s/$issuerNoSub/$key/g;
# Grab the french profile
$contents = &getPage($currenturl);
# Save the french name
my ($name_fr) = ($contents =~ m/<STRONG>(.*?)<\/STRONG>/);
$reftable{$key}{'name_fr'} = encode('utf8', HTML::Entities::decode(defined $name_fr ? $name_fr : ''));
# Cleanup names
$reftable{$key}{'name_en'} =~ s/\s+/ /g;
$reftable{$key}{'name_en'} =~ s/ \((formerly).*//gi;
$reftable{$key}{'name_fr'} =~ s/\s+/ /g;
$reftable{$key}{'name_fr'} =~ s/ \((formerly|anciennement|auparavant).*//gi;
#Print status to screen
print $reftable{$key}{'symbol'} . "\t" . $reftable{$key}{'name_en'} . "\n";
#Print pipe delimited file
print CVSFILE $reftable{$key}{'symbol'} .
"|" . $reftable{$key}{'name_en'} .
"|" . $reftable{$key}{'name_fr'} .
"|" . $reftable{$key}{'industry'} .
"|" . $reftable{$key}{'website'} .
"|" . $reftable{$key}{'address'} .
"|" . $reftable{$key}{'formed'} . "\n";
# Print a wikitable row when the French name differs from the English one
if ($reftable{$key}{'name_fr'} ne $reftable{$key}{'name_en'}) {
print "\t{fr} " . $reftable{$key}{'name_fr'} . "\n";
print WIKITXT "|" . $reftable{$key}{'symbol'} .
"||" . $reftable{$key}{'name_en'} .
"||" . $reftable{$key}{'name_fr'} .
"|| [" . $reftable{$key}{'link_en'} . "] \n|-\n";
}
}
# Close the files
print WIKITXT "|}\n";
close (WIKITXT);
close (CVSFILE);
sub ParseProfile {
# Function to parse items from the profile
my($contents, $expression) = @_;
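# Grab the table cell that follows the "$expression:" label, then strip the markup in front of the value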
my $extract = '';
if ($contents =~ m{$expression:</td>(.*?)</td>}ism) {
$extract = $1;
$extract = $1 if $extract =~ m/>(.*?)$/ism;
}
return $extract;
}
sub getPage {
# Function to get a webpage
my($currenturl) = @_;
my $contents;
my $browser = LWP::UserAgent->new();
$browser->timeout(10);
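# Present a browser User-Agent string and a referer so the request looks like ordinary traffic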
$browser->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19");
my $request = HTTP::Request->new(GET => $currenturl);
$request->header(Accept => "text/html, */*;q=0.1", referer => 'http://google.com');
my $response = $browser->request($request);
# Convert from ISO-8859 to utf8
$contents = encode("utf8",decode("ISO-8859-1",$response->content()));
return $contents;
}