diff --git a/web/landing/assets/controllers/syntax_highlight_controller.js b/web/landing/assets/controllers/syntax_highlight_controller.js index 3ed025399..01a00dec6 100644 --- a/web/landing/assets/controllers/syntax_highlight_controller.js +++ b/web/landing/assets/controllers/syntax_highlight_controller.js @@ -2,6 +2,7 @@ import {Controller} from '@hotwired/stimulus'; import 'highlight.js/styles/github-dark.min.css'; import php from 'highlight.js/lib/languages/php'; import shell from 'highlight.js/lib/languages/shell'; +import json from 'highlight.js/lib/languages/json'; import hljs from 'highlight.js/lib/core'; /* stimulusFetch: 'lazy' */ @@ -11,6 +12,7 @@ export default class extends Controller { hljs.registerLanguage('php', php); hljs.registerLanguage('shell', shell); + hljs.registerLanguage('json', json); } connect() diff --git a/web/landing/assets/styles/app.css b/web/landing/assets/styles/app.css index e8e497465..55aeb9423 100644 --- a/web/landing/assets/styles/app.css +++ b/web/landing/assets/styles/app.css @@ -25,3 +25,6 @@ a { @apply font-medium text-blue-600 dark:text-blue-500 hover:underline; } +code { + font-size: 0.9em; +} \ No newline at end of file diff --git a/web/landing/importmap.php b/web/landing/importmap.php index 309d0a551..33f2feaf2 100644 --- a/web/landing/importmap.php +++ b/web/landing/importmap.php @@ -50,4 +50,7 @@ 'highlight.js/lib/languages/shell' => [ 'version' => '11.9.0', ], + 'highlight.js/lib/languages/json' => [ + 'version' => '11.9.0', + ], ]; diff --git a/web/landing/src/Flow/Website/Controller/BlogController.php b/web/landing/src/Flow/Website/Controller/BlogController.php new file mode 100644 index 000000000..5a6dd5c7f --- /dev/null +++ b/web/landing/src/Flow/Website/Controller/BlogController.php @@ -0,0 +1,24 @@ +render('blog/' . $date . '/' . $slug . '/post.html.twig', [ + 'template_folder' => 'blog/' . $date . '/' . 
$slug, + ]); + } +} diff --git a/web/landing/src/Flow/Website/StaticSourceProvider/BlogPostsProvider.php b/web/landing/src/Flow/Website/StaticSourceProvider/BlogPostsProvider.php new file mode 100644 index 000000000..13357fe1a --- /dev/null +++ b/web/landing/src/Flow/Website/StaticSourceProvider/BlogPostsProvider.php @@ -0,0 +1,24 @@ + '2024-04-04', 'slug' => 'building-custom-extractor-google-analytics']); + + return $sources; + } +} diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/accountSummary.json b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/accountSummary.json new file mode 100644 index 000000000..9ccf8aaf1 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/accountSummary.json @@ -0,0 +1,10 @@ +{ + "name": string, + "account": string, + "displayName": string, + "propertySummaries": [ + { + object (PropertySummary) + } + ] +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-01.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-01.php new file mode 100644 index 000000000..2fdfad31c --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-01.php @@ -0,0 +1,30 @@ +pageSize < 1 || $this->pageSize > 200) { + throw new \Flow\ETL\Exception\InvalidArgumentException('Page size must be greater than 0 and lower than 200.'); + } + } + + public function extract(FlowContext $context): \Generator + { + // TODO + } +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-02.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-02.php new file mode 100644 index 000000000..6f98ef397 --- /dev/null +++ 
b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-02.php @@ -0,0 +1,35 @@ +client->listAccountSummaries(['pageSize' => $this->pageSize]); + + /** @var AccountSummary $account */ + foreach ($list->iterateAllElements() as $accountSummary) { + $signal = yield rows(ga_account_summary_to_row($accountSummary)); + $this->countRow(); + + if ($signal === Signal::STOP || $this->reachedLimit()) { + return; + } + } + + // TODO: Implement pagination + } +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-03.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-03.php new file mode 100644 index 000000000..55d0213fd --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-03.php @@ -0,0 +1,38 @@ +client->listAccountSummaries(['pageSize' => $this->pageSize]); + + // code from previous snippet + + while ($list->getPage()->hasNextPage()) { + $list = $this->client->listAccountSummaries(['pageSize' => $this->pageSize, 'pageToken' => $list->getPage()->getNextPageToken()]); + + foreach ($list->iterateAllElements() as $accountSummary) { + $signal = yield rows(ga_account_summary_to_row($accountSummary)); + $this->countRow(); + + if ($signal === Signal::STOP || $this->reachedLimit()) { + return; + } + } + } + } +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-04.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-04.php new file mode 100644 index 000000000..3331b0012 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-04.php @@ -0,0 +1,55 @@ +pageSize < 1 || $this->pageSize > 200) { + throw new \Flow\ETL\Exception\InvalidArgumentException('Page size must be greater than 0 and lower 
than 200.'); + } + } + + public function extract(FlowContext $context): \Generator + { + $list = $this->client->listAccountSummaries(['pageSize' => $this->pageSize]); + + /** @var AccountSummary $account */ + foreach ($list->iterateAllElements() as $accountSummary) { + $signal = yield rows(ga_account_summary_to_row($accountSummary)); + $this->countRow(); + + if ($signal === Signal::STOP || $this->reachedLimit()) { + return; + } + } + + while ($list->getPage()->hasNextPage()) { + $list = $this->client->listAccountSummaries(['pageSize' => $this->pageSize, 'pageToken' => $list->getPage()->getNextPageToken()]); + + foreach ($list->iterateAllElements() as $accountSummary) { + $signal = yield rows(ga_account_summary_to_row($accountSummary)); + $this->countRow(); + + if ($signal === Signal::STOP || $this->reachedLimit()) { + return; + } + } + } + } +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/from_ga_account_summaries.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/from_ga_account_summaries.php new file mode 100644 index 000000000..0e79d673e --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/from_ga_account_summaries.php @@ -0,0 +1,9 @@ +getAccount()), + str_entry('name', $accountSummary->getName()), + str_entry('displayName', $accountSummary->getDisplayName()), + list_entry( + 'propertySummaries', + array_map( + static fn(PropertySummary $propertySummary) => [ + 'property' => $propertySummary->getProperty(), + 'displayName' => $propertySummary->getDisplayName(), + 'propertyType' => $propertySummary->getPropertyType(), + 'parent' => $propertySummary->getParent(), + ], + \iterator_to_array($accountSummary->getPropertySummaries()) + ), + type_list( + type_structure( + [ + structure_element('property', type_string()), + structure_element('displayName', type_string()), + structure_element('propertyType', 
type_integer()), + structure_element('parent', type_string()), + ] + ) + ), + ) + ); +} diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/post.html.twig b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/post.html.twig new file mode 100644 index 000000000..dd3284b05 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/post.html.twig @@ -0,0 +1,134 @@ +{% extends 'base.html.twig' %} + +{%- block title -%} + Building Custom Data Extractor - Flow PHP +{%- endblock -%} + +{%- block description -%} + Learn how to extract data from Google Analytics API using Flow PHP but also how to build a custom data extractor. +{%- endblock -%} + +{% block main %} +
+ Our goal is to extract a list of Account Summaries from Google Analytics API.
+
+ The very first step is to get familiar with the data structure of the dataset we are going to extract.
+ So let's take a look at Account Summaries API documentation.
+
+ {% include template_folder ~ '/accountSummary.json' %}
+
+ {% endapply %}
+
+ {% include template_folder ~ '/propertySummary.json' %}
+
+ {% endapply %}
+
+
+ Ok, not too bad, we have one list of structures (Property Summary is a structure) and a few other non-nullable fields.
+ This should give us the following schema:
+
+ {% apply escape %}{% include template_folder ~ '/schema.txt' %}{% endapply %}
+
+ {% endapply %}
+
+
+ Now that we have our schema, we can start building our custom data extractor.
+ Our next step is to figure out if there is any existing SDK that we can use to interact with Google Analytics API.
+ In this case, we are going to use Google Analytics Admin SDK.
+
+ Sometimes we might not be able to find any SDK that fits our needs; in such cases we would have to build our own HTTP client and handle the API requests manually. +
+ +
+ Let's start by preparing our Extractor class. We will call it AccountSummariesExtractor
.
+ First we need to make sure that our extractor is implementing the Flow\ETL\Extractor
interface.
+ Technically speaking, that's all we need, but to make our extractor compatible with Flow DataFrame::limit()
function,
+ we should additionally implement Flow\ETL\Extractor\LimitableExtractor
interface.
+
+ Most of the Flow\ETL\Extractor\LimitableExtractor
logic is reusable, so to avoid code duplication we are going to use the Flow\ETL\Extractor\Limitable
trait in our extractor.
+
+ {% apply escape %}{% include template_folder ~ '/extractor-01.php' %}{% endapply %}
+
+ {% endapply %}
+
+
+ Our extractor boilerplate is ready, let's try to implement the extract()
method logic.
+ FlowContext $context
is a container for all parameters/services that are shared between all the stages of the ETL process.
+ We can use it to access things like EntryFactory
or Config
however we are not going to use it in this example.
+
+ Let's start by fetching the list of Account Summaries from Google Analytics API, iterating over the list and yielding each Account Summary. +
+ + {% apply spaceless %} +
+ {% apply escape %}{% include template_folder ~ '/extractor-02.php' %}{% endapply %}
+
+ {% endapply %}
+
+
+ ga_account_summary_to_row
function is responsible for transforming the Account Summary structure into a row that fits our schema.
+ It could be a private method of our Extractor, however, extracting it to a separate function will make our code cleaner and easier to test/use later.
+
+ {% apply escape %}{% include template_folder ~ '/ga_account_summary_to_row.php' %}{% endapply %}
+
+ {% endapply %}
+
+
+ Our final step would be to implement pagination logic, since Google Analytics API returns only up to 200 Account Summaries per page.
+ Typically, we are not going to have more than 200 Account Summaries, but it's always good to be prepared for such cases.
+
+ {% apply escape %}{% include template_folder ~ '/extractor-03.php' %}{% endapply %}
+
+ {% endapply %}
+
+
+ That's it! Our custom data extractor is ready to be used.
+ We can now use it in our ETL process to extract Account Summaries from Google Analytics API.
+
+ {% apply escape %}{% include template_folder ~ '/usage.php' %}{% endapply %}
+
+ {% endapply %}
+
+
+ We are almost done, at this point we can fetch all Account Summaries from Google Analytics API and transform them into a DataFrame.
+ Our final but also optional step would be to prepare a DSL function returning our extractor that is going to improve the readability of our ETL data processing pipeline.
+
+ {% apply escape %}{% include template_folder ~ '/usage-dsl.php' %}{% endapply %}
+
+ {% endapply %}
+