- Download the website indices (25.6 MB): https://f000.backblazeb2.com/file/malay-dataset/dumping/common-crawl/mse-index.zip
- Download the dumped data (9.6 GB): https://f000.backblazeb2.com/file/malay-dataset/dumping/common-crawl/feather.zip
- Download the cleaned plain text (2.93 GB): https://f000.backblazeb2.com/file/malay-dataset/dumping/common-crawl/cleaned-common-crawl.txt
@misc{Malay-Dataset,
note = {We gather Bahasa Malaysia corpus!, Common Crawl},
author = {Husein, Zolkepli},
title = {Malay-Dataset},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/huseinzol05/malay-dataset/tree/master/dumping/common-crawl}}
}