From e0438b4cfc6bd8ff4cbf99987ae12c97377792f4 Mon Sep 17 00:00:00 2001 From: Norbert Orzechowicz <1921950+norberttech@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:38:35 +0200 Subject: [PATCH] How to write a custom data extractor - blog post (#1038) * How to write a custom data extractor - blog post * Added missing blog posts static content generator provider --- .../syntax_highlight_controller.js | 2 + web/landing/assets/styles/app.css | 3 + web/landing/importmap.php | 3 + .../Website/Controller/BlogController.php | 24 ++++ .../BlogPostsProvider.php | 24 ++++ .../accountSummary.json | 10 ++ .../extractor-01.php | 30 ++++ .../extractor-02.php | 35 +++++ .../extractor-03.php | 38 +++++ .../extractor-04.php | 55 +++++++ .../from_ga_account_summaries.php | 9 ++ .../ga_account_summary_to_row.php | 40 ++++++ .../post.html.twig | 134 ++++++++++++++++++ .../propertySummary.json | 6 + .../schema.txt | 5 + .../usage-dsl.php | 12 ++ .../usage.php | 24 ++++ 17 files changed, 454 insertions(+) create mode 100644 web/landing/src/Flow/Website/Controller/BlogController.php create mode 100644 web/landing/src/Flow/Website/StaticSourceProvider/BlogPostsProvider.php create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/accountSummary.json create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-01.php create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-02.php create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-03.php create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-04.php create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/from_ga_account_summaries.php create mode 100644 
web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/ga_account_summary_to_row.php create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/post.html.twig create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/propertySummary.json create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/schema.txt create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/usage-dsl.php create mode 100644 web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/usage.php diff --git a/web/landing/assets/controllers/syntax_highlight_controller.js b/web/landing/assets/controllers/syntax_highlight_controller.js index 3ed025399..01a00dec6 100644 --- a/web/landing/assets/controllers/syntax_highlight_controller.js +++ b/web/landing/assets/controllers/syntax_highlight_controller.js @@ -2,6 +2,7 @@ import {Controller} from '@hotwired/stimulus'; import 'highlight.js/styles/github-dark.min.css'; import php from 'highlight.js/lib/languages/php'; import shell from 'highlight.js/lib/languages/shell'; +import json from 'highlight.js/lib/languages/json'; import hljs from 'highlight.js/lib/core'; /* stimulusFetch: 'lazy' */ @@ -11,6 +12,7 @@ export default class extends Controller { hljs.registerLanguage('php', php); hljs.registerLanguage('shell', shell); + hljs.registerLanguage('json', json); } connect() diff --git a/web/landing/assets/styles/app.css b/web/landing/assets/styles/app.css index e8e497465..55aeb9423 100644 --- a/web/landing/assets/styles/app.css +++ b/web/landing/assets/styles/app.css @@ -25,3 +25,6 @@ a { @apply font-medium text-blue-600 dark:text-blue-500 hover:underline; } +code { + font-size: 0.9em; +} \ No newline at end of file diff --git a/web/landing/importmap.php b/web/landing/importmap.php index 309d0a551..33f2feaf2 100644 --- a/web/landing/importmap.php 
+++ b/web/landing/importmap.php @@ -50,4 +50,7 @@ 'highlight.js/lib/languages/shell' => [ 'version' => '11.9.0', ], + 'highlight.js/lib/languages/json' => [ + 'version' => '11.9.0', + ], ]; diff --git a/web/landing/src/Flow/Website/Controller/BlogController.php b/web/landing/src/Flow/Website/Controller/BlogController.php new file mode 100644 index 000000000..5a6dd5c7f --- /dev/null +++ b/web/landing/src/Flow/Website/Controller/BlogController.php @@ -0,0 +1,24 @@ +render('blog/' . $date . '/' . $slug . '/post.html.twig', [ + 'template_folder' => 'blog/' . $date . '/' . $slug, + ]); + } +} diff --git a/web/landing/src/Flow/Website/StaticSourceProvider/BlogPostsProvider.php b/web/landing/src/Flow/Website/StaticSourceProvider/BlogPostsProvider.php new file mode 100644 index 000000000..13357fe1a --- /dev/null +++ b/web/landing/src/Flow/Website/StaticSourceProvider/BlogPostsProvider.php @@ -0,0 +1,24 @@ + '2024-04-04', 'slug' => 'building-custom-extractor-google-analytics']); + + return $sources; + } +} diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/accountSummary.json b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/accountSummary.json new file mode 100644 index 000000000..9ccf8aaf1 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/accountSummary.json @@ -0,0 +1,10 @@ +{ + "name": string, + "account": string, + "displayName": string, + "propertySummaries": [ + { + object (PropertySummary) + } + ] +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-01.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-01.php new file mode 100644 index 000000000..2fdfad31c --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-01.php @@ -0,0 +1,30 @@ +pageSize < 1 || 
$this->pageSize > 200) { + throw new \Flow\ETL\Exception\InvalidArgumentException('Page size must be greater than 0 and lower than 200.'); + } + } + + public function extract(FlowContext $context): \Generator + { + // TODO + } +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-02.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-02.php new file mode 100644 index 000000000..6f98ef397 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-02.php @@ -0,0 +1,35 @@ +client->listAccountSummaries(['pageSize' => $this->pageSize]); + + /** @var AccountSummary $account */ + foreach ($list->iterateAllElements() as $accountSummary) { + $signal = yield rows(ga_account_summary_to_row($accountSummary)); + $this->countRow(); + + if ($signal === Signal::STOP || $this->reachedLimit()) { + return; + } + } + + // TODO: Implement pagination + } +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-03.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-03.php new file mode 100644 index 000000000..55d0213fd --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-03.php @@ -0,0 +1,38 @@ +client->listAccountSummaries(['pageSize' => $this->pageSize]); + + // code from previous snippet + + while ($list->getPage()->hasNextPage()) { + $list = $this->client->listAccountSummaries(['pageSize' => $this->pageSize, 'pageToken' => $list->getPage()->getNextPageToken()]); + + foreach ($list->iterateAllElements() as $accountSummary) { + $signal = yield rows(ga_account_summary_to_row($accountSummary)); + $this->countRow(); + + if ($signal === Signal::STOP || $this->reachedLimit()) { + return; + } + } + } + } +} \ No newline at end of file diff 
--git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-04.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-04.php new file mode 100644 index 000000000..3331b0012 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/extractor-04.php @@ -0,0 +1,55 @@ +pageSize < 1 || $this->pageSize > 200) { + throw new \Flow\ETL\Exception\InvalidArgumentException('Page size must be greater than 0 and lower than 200.'); + } + } + + public function extract(FlowContext $context): \Generator + { + $list = $this->client->listAccountSummaries(['pageSize' => $this->pageSize]); + + /** @var AccountSummary $account */ + foreach ($list->iterateAllElements() as $accountSummary) { + $signal = yield rows(ga_account_summary_to_row($accountSummary)); + $this->countRow(); + + if ($signal === Signal::STOP || $this->reachedLimit()) { + return; + } + } + + while ($list->getPage()->hasNextPage()) { + $list = $this->client->listAccountSummaries(['pageSize' => $this->pageSize, 'pageToken' => $list->getPage()->getNextPageToken()]); + + foreach ($list->iterateAllElements() as $accountSummary) { + $signal = yield rows(ga_account_summary_to_row($accountSummary)); + $this->countRow(); + + if ($signal === Signal::STOP || $this->reachedLimit()) { + return; + } + } + } + } +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/from_ga_account_summaries.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/from_ga_account_summaries.php new file mode 100644 index 000000000..0e79d673e --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/from_ga_account_summaries.php @@ -0,0 +1,9 @@ +getAccount()), + str_entry('name', $accountSummary->getName()), + str_entry('displayName', $accountSummary->getDisplayName()), + list_entry( + 
'propertySummaries', + array_map( + static fn(PropertySummary $propertySummary) => [ + 'property' => $propertySummary->getProperty(), + 'displayName' => $propertySummary->getDisplayName(), + 'propertyType' => $propertySummary->getPropertyType(), + 'parent' => $propertySummary->getParent(), + ], + \iterator_to_array($accountSummary->getPropertySummaries()) + ), + type_list( + type_structure( + [ + structure_element('property', type_string()), + structure_element('displayName', type_string()), + structure_element('propertyType', type_integer()), + structure_element('parent', type_string()), + ] + ) + ), + ) + ); +} diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/post.html.twig b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/post.html.twig new file mode 100644 index 000000000..dd3284b05 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/post.html.twig @@ -0,0 +1,134 @@ +{% extends 'base.html.twig' %} + +{%- block title -%} + Building Custom Data Extractor - Flow PHP +{%- endblock -%} + +{%- block description -%} + Learn how to extract data from Google Analytics API using Flow PHP but also how to build a custom data extractor. +{%- endblock -%} + +{% block main %} +
+

Building Custom Data Extractor

+ +

+ Our goal is to extract a list of Account Summaries from Google Analytics API.
+

+

+ The very first step is to get familiar with the data structure of the dataset we are going to extract.
+ So let's take a look at Account Summaries API documentation. +

+ +

Account Summary

+ {% apply spaceless %} +
+                {% include template_folder ~ '/accountSummary.json' %}
+            
+ {% endapply %} +

Property Summary

+ {% apply spaceless %} +
+                {% include template_folder ~ '/propertySummary.json' %}
+            
+ {% endapply %} + +

+ Ok, not too bad: we have one list of structures (Property Summary is a structure) and a few other non-nullable fields.
+ This should give us the following schema: +

+ + {% apply spaceless %} +
+                {% apply escape %}{% include template_folder ~ '/schema.txt'  %}{% endapply %}
+            
+ {% endapply %} + +

+ Now that we have our schema, we can start building our custom data extractor.
+ Our next step is to figure out if there is any existing SDK that we can use to interact with Google Analytics API.
+ In this case, we are going to use Google Analytics Admin SDK. +

+ +

+ Sometimes we might not be able to find any SDK that fits our needs; in such cases we would have to build our own HTTP client and handle the API requests manually. +

+ +

+ Let's start by preparing our Extractor class. We will call it AccountSummariesExtractor.
+ First we need to make sure that our extractor is implementing the Flow\ETL\Extractor interface.
+ Technically speaking, that's all we need, but to make our extractor compatible with the Flow DataFrame::limit() function, + we should additionally implement the Flow\ETL\Extractor\LimitableExtractor interface. +

+

+ Most of the Flow\ETL\Extractor\LimitableExtractor logic is reusable, so to avoid code duplication we are going to use the Flow\ETL\Extractor\Limitable trait in our extractor. +

+ + {% apply spaceless %} +
+                {% apply escape %}{% include template_folder ~ '/extractor-01.php'  %}{% endapply %}
+            
+ {% endapply %} + +

+ Our extractor boilerplate is ready, let's try to implement the extract() method logic.
+ FlowContext $context is a container for all parameters/services that are shared between all the stages of the ETL process.
+ We can use it to access things like EntryFactory or Config; however, we are not going to use it in this example. +

+ +

+ Let's start by fetching the list of Account Summaries from the Google Analytics API, iterating over the list, and yielding each Account Summary. +

+ + {% apply spaceless %} +
+                {% apply escape %}{% include template_folder ~ '/extractor-02.php'  %}{% endapply %}
+            
+ {% endapply %} + +

+ ga_account_summary_to_row function is responsible for transforming the Account Summary structure into a row that fits our schema.
+ It could be a private method of our Extractor; however, extracting it to a separate function will make our code cleaner and easier to test/use later. +

+ + {% apply spaceless %} +
+                {% apply escape %}{% include template_folder ~ '/ga_account_summary_to_row.php'  %}{% endapply %}
+            
+ {% endapply %} + +

+ Our final step would be to implement pagination logic, since Google Analytics API returns only up to 200 Account Summaries per page.
+ Typically, we are not going to have more than 200 Account Summaries, but it's always good to be prepared for such cases. +

+ + {% apply spaceless %} +
+                {% apply escape %}{% include template_folder ~ '/extractor-03.php'  %}{% endapply %}
+            
+ {% endapply %} + +

+ That's it! Our custom data extractor is ready to be used.
+ We can now use it in our ETL process to extract Account Summaries from Google Analytics API. +

+ + {% apply spaceless %} +
+                {% apply escape %}{% include template_folder ~ '/usage.php'  %}{% endapply %}
+            
+ {% endapply %} + +

+ We are almost done; at this point we can fetch all Account Summaries from the Google Analytics API and transform them into a DataFrame.
+ Our final, optional step is to prepare a DSL function that returns our extractor, which will improve the readability of our ETL data processing pipeline. +

+ + + {% apply spaceless %} +
+                {% apply escape %}{% include template_folder ~ '/usage-dsl.php'  %}{% endapply %}
+            
+ {% endapply %} +
+{% endblock %} diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/propertySummary.json b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/propertySummary.json new file mode 100644 index 000000000..7f38e25d9 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/propertySummary.json @@ -0,0 +1,6 @@ +{ + "property": string, + "displayName": string, + "propertyType": enum (PropertyType), // integer + "parent": string +} \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/schema.txt b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/schema.txt new file mode 100644 index 000000000..12720aff6 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/schema.txt @@ -0,0 +1,5 @@ +schema +|-- account: string +|-- name: string +|-- displayName: string +|-- propertySummaries: list \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/usage-dsl.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/usage-dsl.php new file mode 100644 index 000000000..5cb6f94a3 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/usage-dsl.php @@ -0,0 +1,12 @@ +read(from_ga_account_summaries($client)) + ->limit(2) + ->collect() + ->write(to_output()) + ->run(); \ No newline at end of file diff --git a/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/usage.php b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/usage.php new file mode 100644 index 000000000..91514f9f3 --- /dev/null +++ b/web/landing/templates/blog/2024-04-04/building-custom-extractor-google-analytics/usage.php @@ -0,0 +1,24 @@ + $credentials +// ]); + +df() + 
->read(new AccountSummariesExtractor($client)) + ->limit(2) + ->collect() + ->write(to_output()) + ->run(); + +// Output +// +--------------------+----------------------+--------------+----------------------+ +// | account | name | displayName | propertySummaries | +// +--------------------+----------------------+--------------+----------------------+ +// | accounts/111111111 | accountSummaries/111 | norbert.tech | [{"property":"proper | +// | accounts/222222222 | accountSummaries/222 | aeon-php | [{"property":"proper | +// +--------------------+----------------------+--------------+----------------------+