Commit 766b23e0 by Bostjan Skufca

Initial code commit

parent 48785996
composer.phar ### Composer-specific paths
vendor/ #
/composer.phar
/vendor/
# Commit your application's lock file http://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file
# You may choose to ignore a library lock file http://getcomposer.org/doc/02-libraries.md#lock-file
# composer.lock ### Sitemap-specific paths
#
/sitemap.yaml
# php-brute-force-sitemap-generator # PHP Brute Force Sitemap Generator
Generate sitemaps by crawling your website for static pages and using hooks for dynamic content
Generate sitemaps by crawling your website for static pages and using hooks for
dynamic content. Intermediate sitemap URI list is stored in relative format and
served dynamically by generating final documents where relative URIs are
prefixed with configured base URI, resulting in final document that contains
full URIs.
Features:
* crawl your website for static content and generate list of URIs;
* seed crawler with existing URI list, or add URIs manually and recrawl
to avoid missed content in the future;
* store URIs in relative format;
* when serving sitemaps, convert relative URIs to absolute form with
configured prefix URI.
## Target users
Sitemaps should generally be as accurate as possible, and this means that
your application needs to have infrastructure prepared for enumerating all
content it serves. Many applications do not support this, or support it only
partially.
Users that are stuck using such applications and who have to provide sitemaps
are usually left with the option of pre-generating sitemaps using public web
crawlers. This results in inaccurate and stale sitemaps.
This is where Brute Force Sitemap Generator (BFSG) steps in.
## Modes of operation
Definition of terms:
* **base URI**: URI under which the sitemap will reside, e.g. https://example.com/ (without the trailing "sitemap.*")
* **transData**: Short for "transitional data"; it represents sitemap data that does not
contain absolute URIs. Absolute URIs are generated at the very last stage,
where an HTTP request for the sitemap triggers generation of the final sitemap by prefixing
all relative URIs with the base URI prefix, which is obtained dynamically.
BFSG implements the following operations:
* create transData by crawling existing website
** crawling may be seeded by base URI only
** may be seeded by existing transData (list of URIs that were previously encountered)
* augment transData generated by crawler by using callback (for dynamically generated pages)
* using transData cache to generate and output final sitemap.(xml|txt)(.gz)?
BFSG can be glued to your application in the following ways:
1. add BFSG to your project as git submodule:
** you need to create sitemap-glue.php file that returns needed configuration details from your project
** sitemap-glue.php must reside on the same path level as main BFSG directory (just outside of BFSG source tree)
** (reasoning for this is that you will want to commit your glue code to your project repository instead of BFSG's git repo)
1. install BFSG with composer - TODO
1. Symfony: add BFSG as bundle - TODO
## License
BFSG is released under MIT license. See LICENSE file at the root of repository for
additional info.
## Credits
Brute Force Sitemap Generator was created and is maintained by Bostjan Skufca & Teon d.o.o company.
<?php
/**
* Import classes
*/
use Teon\Sitemap\Generator\BruteForce\Config;
/**
* Engage composer
*/
require __DIR__.'/../vendor/autoload.php';
/**
 * Try to load configuration from integration file
 *
 * The integration ("glue") file is optional. When present, it is expected to
 * `return` the configuration array, which is then handed over to Config.
 */
$integrationFilePath = __DIR__ .'/../../sitemap-glue.php';
if (file_exists($integrationFilePath)) {
    $configData = require $integrationFilePath;

    // A glue file without a `return` statement makes require yield int(1);
    // fail with a clear message instead of a type error inside setConfig()
    if (!is_array($configData)) {
        throw new \UnexpectedValueException(
            "Integration file must return a configuration array: $integrationFilePath"
        );
    }

    Config::setConfig($configData);
}
#!/usr/bin/env php
<?php
/**
* Import classes
*/
use Symfony\Component\Console\Application;
use Teon\Sitemap\Generator\BruteForce\Command\CrawlCommand;
use Teon\Sitemap\Generator\BruteForce\Command\GenerateTxtCommand;
use Teon\Sitemap\Generator\BruteForce\Command\GenerateXmlCommand;
/**
* Bootstrap first
*/
require __DIR__.'/../app/bootstrap.php';
/**
 * Build the console application, register the sitemap commands and run it
 */
$application = new Application();
$application->addCommands(array(
    new CrawlCommand(),
    new GenerateTxtCommand(),
    new GenerateXmlCommand(),
));
$application->run();
#!/bin/bash

### Init shell
#
# Run from the directory this script lives in, so that ./console resolves no
# matter where the script was called from. Quoted to survive paths with spaces.
cd "$(dirname "$0")" || exit 1

### Check if command exists
#
CMD="$1"
if [ "$CMD" != "" ]; then
    # Take the command names following the "sitemap:" prefix from the listing
    # (any amount of leading indentation is accepted) and look for the requested
    # name among them. "--" keeps a CMD starting with "-" from being parsed as a
    # grep option.
    if ./console list sitemap | grep '^[[:space:]]*sitemap:' | sed -e 's/^[[:space:]]*sitemap://' | awk '{print $1}' | grep -q -- "$CMD"; then
        # Pass remaining arguments on unchanged, preserving embedded whitespace
        shift
        ./console "sitemap:$CMD" "$@"
    else
        ./console list sitemap
        echo
        echo "ERROR: Unknown sitemap:... command, see help above or use ./console instead"
        echo
        # Signal the failure to the caller
        exit 1
    fi
else
    ./console list sitemap
fi
{
"name": "teon/brute-force-sitemap-generator",
"description": "Generate sitemaps by crawling your website for static pages and using hooks for dynamic content",
"type": "library",
"license": "MIT",
"authors": [
{
"name" : "Teon d.o.o",
"email" : "opensource@teon.si",
"homepage" : "http://teon.si"
}
],
"autoload": {
"psr-4": {
"Teon\\Sitemap\\Generator\\BruteForce\\": "src/"
}
},
"require": {
"ext-simplexml": "*",
"ext-dom": "*",
"symfony/console": "^2.7",
"symfony/http-foundation": "^2.7",
"symfony/yaml": "^2.7",
"zendframework/zend-http": "^2.5"
}
}
This diff is collapsed. Click to expand it.
###
### Brute Force Sitemap Generator
###
#
# This is a sample .htaccess file that redirects requests for sitemap files
# to sitemap.php provided by Brute Force Sitemap Generator.
#
# By default, it ignores existing sitemap files.
#
# Turn on the rewrite engine so the rule below is evaluated
RewriteEngine on
# Send requests for sitemap.txt or sitemap.xml (optionally followed by .gz,
# optionally preceded by a slash) to /sitemap.php
RewriteRule ^/?sitemap\.(txt|xml)(\.gz)?$ /sitemap.php
<?php
/**
* Import classes
*/
//use Symfony\Component\Console\Application;
use Teon\Sitemap\Generator\BruteForce\Generator;
/**
* Bootstrap first
*/
require __DIR__.'/../app/bootstrap.php';
/**
 * Decide which format to throw out
 *
 * The format is taken from the extension of the requested sitemap file
 * (sitemap.<format>); a trailing ".gz" additionally requests gzip output.
 */
$requestUri = $_SERVER['REQUEST_URI'];

// REQUEST_URI includes the query string; match against the path part only so
// that e.g. "/sitemap.xml?foo=bar" is still recognised
$requestPath = strtok($requestUri, '?');

// Detect format
if (!preg_match('#/sitemap\.([^.]+)(\.gz)?$#', $requestPath, $m)) {
    throw new Exception("Unsupported sitemap format: $requestUri");
}
$format = $m[1];

// Detect compression
$compressed = (bool) preg_match('#\.gz$#', $requestPath);



/*
 * Generate in appropriate format
 */
$Generator = new Generator();
switch ($format) {
    case "txt":
        // Plain URI list
        $contentType = "text/plain";
        $sitemapData = $Generator->generateTxt();
        break;
    case "xml":
        // XML sitemap document
        $contentType = "application/xml";
        $sitemapData = $Generator->generateXml();
        break;
    default:
        throw new Exception("Unsupported sitemap format: $format");
}



/**
 * Manage output
 *
 * Compressed responses are announced as gzip, regardless of the format of
 * the sitemap inside.
 */
if ($compressed) {
    header('Content-Type: application/gzip');
    echo gzencode($sitemapData);
} else {
    header("Content-Type: $contentType");
    echo $sitemapData;
}
<?php
// Init your application
// ...
// Get current baseUri for your application instance
// either from database or from webserver environment ($_SERVER)
// and assign it to $baseUri, which is referenced in the array below
// ...
// Return BF Sitemap Generator configuration
return array(
/*
* SETTING:
* baseUri
*
* Base URI of your application, used for two things:
* - for generating absolute URIs in final sitemap.xml
* - default initial crawl url
*/
'baseUri' => $baseUri,
/*
* SETTING:
* ignoreRegex
*
* Regular expression of URIs to ignore. Must include leading and trailing '/' character.
* Leave as null if you do not use this feature.
*
* Example:
* '/language=/',
*/
'ignoreRegex' => NULL,
/*
* SETTING:
* transDataFile
* transDataFileNew
*
* Transitional (cached) data from which final sitemap is generated.
* It is also used as URI list seed for crawler.
* The "*New" setting is the default destination file for crawler in order
* to avoid losing already-generated transData.
*/
'transDataFile' => __DIR__ .'/../path/to/app/config/sitemap-transData.yaml',
'transDataFileNew' => __DIR__ .'/../path/to/app/config/sitemap-transData.yaml.new',
);
<?php
namespace Teon\Sitemap\Generator\BruteForce\Command;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Teon\Sitemap\Generator\BruteForce\Config;
use Teon\Sitemap\Generator\BruteForce\Crawler;
use Teon\Sitemap\Generator\BruteForce\Uri;
use Teon\Sitemap\Generator\BruteForce\UriList;
/**
 * Console command "sitemap:crawl"
 *
 * Crawls the site starting at the base URI, optionally seeded with a list of
 * URIs loaded from a transData file, and writes the resulting URI list to the
 * output file as transData YAML.
 */
class CrawlCommand extends Command
{



    /**
     * Command configuration
     *
     * All options default to the corresponding Config settings.
     */
    protected function configure ()
    {
        $this
            ->setName('sitemap:crawl')
            ->setDescription('Crawl configured domain and generate URL list')
            ->addOption(
                'base-uri',
                'b',
                InputOption::VALUE_OPTIONAL,
                'Base URI to start crawling at',
                Config::get('baseUri')
            )
            ->addOption(
                'seed-file',
                's',
                InputOption::VALUE_OPTIONAL,
                'Existing list of URIs to add to to-crawl schedule, in transData format (YAML)',
                Config::get('transDataFile')
            )
            ->addOption(
                'output-file',
                'o',
                InputOption::VALUE_OPTIONAL,
                'Location of file to store generated transData content (YAML)',
                Config::get('transDataFileNew')
            )
        ;
    }



    /**
     * Command execution
     *
     * @param InputInterface  $input   Provides the base-uri, seed-file and output-file options
     * @param OutputInterface $output  Receives progress, redirect and error reports
     *
     * @throws \RuntimeException  If the transData output file cannot be written
     */
    protected function execute (InputInterface $input, OutputInterface $output)
    {
        // Get configuration
        $baseUri    = $input->getOption('base-uri');
        $BaseUri    = new Uri($baseUri);
        $seedFile   = $input->getOption('seed-file');
        $outputFile = $input->getOption('output-file');

        // Init crawler
        $Crawler = new Crawler();
        $Crawler->setConsoleOutput($output);

        $output->writeln("");
        $output->writeln("Starting crawl:");
        $output->writeln(" Base URI: ". $BaseUri->getAbsoluteUri());

        $UriListSeed = NULL;
        if (!empty($seedFile)) {
            $output->writeln(" URI seed file: $seedFile");

            // The seed file defaults to the configured transData file, which does
            // not exist before the first crawl; crawl unseeded in that case
            // instead of feeding a failed read into the YAML loader
            $transData = false;
            if (is_file($seedFile) && is_readable($seedFile)) {
                $transData = file_get_contents($seedFile);
            }

            if (false === $transData) {
                $output->writeln(" URI seed file: not readable, crawling without seed.");
            } else {
                $UriListSeed = new UriList($BaseUri);
                $UriListSeed->loadTransDataYaml($transData);
                $output->writeln(" URI seed file: loaded ". $UriListSeed->getCount() ." URIs.");
            }
        }
        $output->writeln("");

        // Crawl
        $UriList = $Crawler->crawlAndGenerateUriList($BaseUri, $UriListSeed);

        // VISUAL OUTPUT: Display redirects
        $UriList->reset();
        while ($Uri = $UriList->getNext()) {
            if ($Uri->isRedirect()) {
                $DestUri = $Uri->getRedirectUri();
                $output->writeln("Redirect: $Uri --> $DestUri");
            }
        }

        // VISUAL OUTPUT: Display errors (separate pass, so they are listed after all redirects)
        $UriList->reset();
        while ($Uri = $UriList->getNext()) {
            if ($Uri->isError()) {
                $output->writeln("ERROR: $Uri");
            }
        }

        // Write transData cache; do not report success if nothing was written
        if (false === file_put_contents($outputFile, $UriList->generateTransDataYaml())) {
            throw new \RuntimeException("Unable to write transitional data file: $outputFile");
        }
        $output->writeln("");
        $output->writeln("Transitional data file generated:");
        $output->writeln(" $outputFile");
        $output->writeln("");
        $output->writeln("All done.");
    }
}
<?php
namespace Teon\Sitemap\Generator\BruteForce\Command;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Teon\Sitemap\Generator\BruteForce\Config;
use Teon\Sitemap\Generator\BruteForce\Generator;
/**
 * Console command "sitemap:generate:txt"
 *
 * Prints the sitemap in txt format, as produced by Generator::generateTxt().
 */
class GenerateTxtCommand extends Command
{



    /**
     * Command configuration
     *
     * Both options default to the corresponding Config settings.
     */
    protected function configure ()
    {
        $this
            ->setName('sitemap:generate:txt')
            ->setDescription('Generate sitemap.txt content from transData file (YAML)')
            ->addOption(
                'input-file',
                'i',
                InputOption::VALUE_OPTIONAL,
                'TransData file to use',
                Config::get('transDataFile')
            )
            ->addOption(
                'base-uri',
                'b',
                InputOption::VALUE_OPTIONAL,
                'Base URI to prefix relative URIs with',
                Config::get('baseUri')
            )
        ;
    }



    /**
     * Command execution
     *
     * Stores the option values in Config before the Generator is created.
     * NOTE(review): presumably the Generator reads both settings from Config - confirm.
     *
     * @param InputInterface  $input   Provides the input-file and base-uri options
     * @param OutputInterface $output  Not used; the sitemap is echoed directly
     */
    protected function execute (InputInterface $input, OutputInterface $output)
    {
        Config::set('transDataFile', $input->getOption('input-file'));
        Config::set('baseUri',       $input->getOption('base-uri'));

        $Generator = new Generator();
        echo $Generator->generateTxt();
    }
}
<?php
namespace Teon\Sitemap\Generator\BruteForce\Command;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Teon\Sitemap\Generator\BruteForce\Config;
use Teon\Sitemap\Generator\BruteForce\Generator;
/**
 * Console command "sitemap:generate:xml"
 *
 * Prints the sitemap in xml format, as produced by Generator::generateXml().
 */
class GenerateXmlCommand extends Command
{



    /**
     * Command configuration
     *
     * Both options default to the corresponding Config settings.
     */
    protected function configure ()
    {
        $this
            ->setName('sitemap:generate:xml')
            ->setDescription('Generate sitemap.xml content from sitemap-transData.yaml')
            ->addOption(
                'input-file',
                'i',
                InputOption::VALUE_OPTIONAL,
                'TransData file to use',
                Config::get('transDataFile')
            )
            ->addOption(
                'base-uri',
                'b',
                InputOption::VALUE_OPTIONAL,
                'Base URI to prefix relative URIs with',
                Config::get('baseUri')
            )
        ;
    }



    /**
     * Command execution
     *
     * Stores the option values in Config before the Generator is created.
     * NOTE(review): presumably the Generator reads both settings from Config - confirm.
     *
     * @param InputInterface  $input   Provides the input-file and base-uri options
     * @param OutputInterface $output  Not used; the sitemap is echoed directly
     */
    protected function execute (InputInterface $input, OutputInterface $output)
    {
        Config::set('transDataFile', $input->getOption('input-file'));
        Config::set('baseUri',       $input->getOption('base-uri'));

        $Generator = new Generator();
        echo $Generator->generateXml();
    }
}
<?php
namespace Teon\Sitemap\Generator\BruteForce;
/**
 * Static configuration registry
 *
 * Holds the settings array; when nothing has been configured yet, built-in
 * defaults are installed on first access.
 *
 * NOTE(review): the unqualified `Exception` below resolves to a class of that
 * name in this file's namespace - confirm such a class exists, otherwise
 * `\Exception` is what was meant.
 */
class Config
{



    /*
     * Config data array (false until configured or initialized with defaults)
     */
    protected static $configData = false;



    /**
     * Replace the whole configuration
     *
     * @param array $configData  Setting name => value
     */
    public static function setConfig (array $configData)
    {
        self::$configData = $configData;
    }



    /**
     * Change the value of a single, already known setting
     *
     * @param string $setting  Setting name
     * @param mixed  $value    New value
     *
     * @throws Exception  If the setting is not present in the configuration
     */
    public static function set ($setting, $value)
    {
        self::initialize();

        // array_key_exists(), not isset(): a setting whose value is null is still a known setting
        if (!array_key_exists($setting, self::$configData)) {
            throw new Exception("Unknown configuration setting: $setting");
        }

        self::$configData[$setting] = $value;
    }



    /**
     * Initialize config with default values
     *
     * Does nothing when a non-empty configuration is already in place.
     */
    public static function initialize ()
    {
        // Loose check on purpose: both the initial false and an empty array trigger the defaults
        if (false == self::$configData) {
            self::$configData = array(
                'baseUri'          => "https://FIXME/",
                'transDataFile'    => __DIR__ .'/../sitemap-transData.yaml',
                'transDataFileNew' => __DIR__ .'/../sitemap-transData.yaml.new',
            );
        }
    }



    /**
     * Get setting
     *
     * @param string $setting  Setting name
     *
     * @return mixed  Stored value, which may be null
     *
     * @throws Exception  If the setting is not present in the configuration
     */
    public static function get ($setting)
    {
        self::initialize();

        // array_key_exists(), not isset(): null is a legitimate value (e.g. a disabled ignoreRegex)
        if (!array_key_exists($setting, self::$configData)) {
            throw new Exception("Unknown configuration setting: $setting");
        }

        return self::$configData[$setting];
    }



    /**
     * Get the "baseUri" setting
     */
    public static function getBaseUri ()
    {
        return self::get("baseUri");
    }



    /**
     * Get the "transDataFile" setting
     */
    public static function getTransDataFile ()
    {
        return self::get("transDataFile");
    }



    /**
     * Get the "transDataFileNew" setting
     */
    public static function getTransDataFileNew ()
    {
        return self::get("transDataFileNew");
    }
}
<?php
namespace Teon\Sitemap\Generator\BruteForce;
# use Teon\Sitemap\Generator\BruteForce\Crawler;
use Symfony\Component\Console\Output\OutputInterface;
class Crawler // extends Command
{
/*
* Console output object
*/
protected $consoleOutput = NULL;
/**
 * Store the console output object that consoleWriteln() writes to
 *
 * @param OutputInterface $consoleOutput  Output object to use from now on
 */
public function setConsoleOutput (OutputInterface $consoleOutput)
{
    $this->consoleOutput = $consoleOutput;
}
/**
 * Write a line of text to the console, if a console is available
 *
 * Without a console output object the text is silently discarded.
 *
 * @param string $text  Line to write
 */
protected function consoleWriteln ($text)
{
    // Loose comparison kept: anything equal to NULL counts as "no console"
    if (NULL == $this->consoleOutput) {
        return;
    }

    $this->consoleOutput->writeln($text);
}
public function crawlAndGenerateUriList (Uri $BaseUri, UriList $UriListSeed = NULL)
{