Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update blog query: allow Boolean operators #12

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion config/db/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
const logSQL = true
const logSQL = false
const initOptions = {
query(e) {
if (logSQL) console.log(e.query)
Expand Down
6 changes: 3 additions & 3 deletions config/edit/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@ exports.app_description = require('./translations.js').translations['app descrip

// apps_in_suite NEED TO BE THE NAMES OF THE DIFFERENT DBs
exports.apps_in_suite = [
{ name: 'Action Plans', key: process.env.NODE_ENV === 'production' ? 'action_plans_platform' : (process.env.DB_AP || 'ap_test_02'), baseurl: 'https://acclabs-actionlearningplans.azurewebsites.net/' },
{ name: 'Solutions Mapping', key: process.env.NODE_ENV === 'production' ? 'solutions_mapping_platform' : (process.env.DB_SM || 'sm_test_02'), baseurl: 'https://acclabs-solutionsmapping.azurewebsites.net/' },
{ name: 'Experiments', key: process.env.NODE_ENV === 'production' ? 'experiments_platform' : (process.env.DB_EXP || 'exp_test_02'), baseurl: 'https://acclabs-experiments.azurewebsites.net/' }
{ name: 'Action Plans', key: process.env.NODE_ENV === 'production' ? 'action_plans_platform' : (process.env.DB_AP || 'solutions_mapping_platform'), baseurl: 'https://acclabs-actionlearningplans.azurewebsites.net/' },
{ name: 'Solutions Mapping', key: process.env.NODE_ENV === 'production' ? 'solutions_mapping_platform' : (process.env.DB_SM || 'solutions_mapping_platform'), baseurl: 'https://acclabs-solutionsmapping.azurewebsites.net/' },
{ name: 'Experiments', key: process.env.NODE_ENV === 'production' ? 'experiments_platform' : (process.env.DB_EXP || 'solutions_mapping_platform'), baseurl: 'https://acclabs-experiments.azurewebsites.net/' }
// { name: 'Blogs', key: 'exp_test_02', baseurl: 'https://acclabs-blogs.azurewebsites.net/' },
// { name: 'Consent archive', key: 'exp_test_02', baseurl: 'https://acclabs-consent-archive.azurewebsites.net/' },
// { name: 'Buzz', key: 'exp_test_02', baseurl: 'https://acclabs-buzz.azurewebsites.net/' },
Expand Down
140 changes: 88 additions & 52 deletions routes/browse/blog/query.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// Retrieve aggregate data from the database
const { page_content_limit } = include('config/')
const { parsers } = include('routes/helpers/')

const theWhereClause = (country, type )=> {
let whereClause = '';
Expand All @@ -24,19 +25,6 @@ const theWhereClause = (country, type )=> {
return whereClause;
}

const searchTextConditionFn = (searchText) => {
let searchTextCondition = '';
if (searchText !== null && searchText !== undefined && searchText.length > 0) {
searchTextCondition = `
AND (title ~* '\\m${searchText}\\M'
OR content ~* '\\m${searchText}\\M'
OR all_html_content ~* '\\m${searchText}\\M')
`;
}

return searchTextCondition;
}

exports.blogAggQuery =`
SELECT COUNT(*) AS totalBlogs
FROM articles
Expand All @@ -58,53 +46,68 @@ exports.totalUnknownCountries = `
FROM articles
WHERE country IS NULL;
`

exports.searchBlogQuery = (searchText, page, country, type) => {
let whereClause = theWhereClause(country, type);
let values = [
page_content_limit,
(page - 1) * page_content_limit,
page,
];
const search = searchText ? parsers.regexQuery(searchText) : '';
let searchTextCondition = '';
if (searchText !== null && searchText !== undefined && searchText.length > 0) {

let textColumn = "COALESCE(content, all_html_content)";

if (searchText !== null && searchText !== undefined && searchText.length > 0) {
searchTextCondition = `
AND (title ~* ('\\m' || $3::TEXT || '\\M')
OR content ~* ('\\m' || $3::TEXT || '\\M')
OR all_html_content ~* ('\\m' || $3::TEXT || '\\M')
OR country ~* ('\\m' || $3::TEXT || '\\M'))
AND (title ~* ('\\m' || $3 || '\\M')
OR ${textColumn} ~* ('\\m' || $3 || '\\M')
OR country ~* ('\\m' || $3 || '\\M'))
`;
values.splice(2, 0, searchText);

values.splice(2, 0, search);
} else {
searchTextCondition = `
AND (article_type = 'blog'
AND (${textColumn} IS NOT NULL)
AND title IS NOT NULL)
`;
values.splice(2, 0);
}
return {

return {
text: `
WITH search_results AS (
SELECT id, url, content, country, article_type, title, posted_date, posted_date_str, language, created_at, all_html_content
FROM articles
WHERE has_lab IS TRUE
${searchTextCondition}
${whereClause}
ORDER BY posted_date DESC
LIMIT $1 OFFSET $2
),
total_count AS (
SELECT COUNT(*) AS total_records
FROM articles
WHERE has_lab IS TRUE
SELECT id, url, country, article_type, title, posted_date,
posted_date_str, language, created_at,
regexp_replace(
regexp_replace(${textColumn}, E'\\n', ' ', 'g'),
E'\\s+', ' ', 'g'
) AS content
FROM articles
WHERE has_lab IS TRUE
${searchTextCondition}
${whereClause}
)
SELECT sr.*, tc.total_records, (CEIL(tc.total_records::numeric / $1)) AS total_pages, ${searchTextCondition ? '$4' : '$3'} AS current_page
FROM search_results sr
CROSS JOIN total_count tc;
LIMIT $1
OFFSET $2;
`,
values,
};
};
}


exports.articleGroup = (searchText, country, type) => {
let whereClause = theWhereClause(country, type);
let searchTextCondition = searchTextConditionFn(searchText);
const search = searchText ? parsers.regexQuery(searchText) : '';
let searchTextCondition = '';
const values = [];
if (searchText !== null && searchText !== undefined && searchText.length > 0) {
searchTextCondition = `
AND (title ~* ('\\m' || $1 || '\\M')
OR content ~* ('\\m' || $1 || '\\M')
OR country ~* ('\\m' || $1 || '\\M'))
`;

values.push(search);
}

return {
text: `
Expand All @@ -115,13 +118,24 @@ exports.articleGroup = (searchText, country, type) => {
${whereClause}
GROUP BY article_type;
`,
values: [],
values,
};
};

exports.countryGroup = (searchText, country, type) => {
let whereClause = theWhereClause(country, type);
let searchTextCondition = searchTextConditionFn(searchText);
const search = searchText ? parsers.regexQuery(searchText) : '';
let searchTextCondition = '';
const values = [];
if (searchText !== null && searchText !== undefined && searchText.length > 0) {
searchTextCondition = `
AND (title ~* ('\\m' || $1 || '\\M')
OR content ~* ('\\m' || $1 || '\\M')
OR all_html_content ~* ('\\m' || $1 || '\\M')
OR country ~* ('\\m' || $1 || '\\M'))
`;

values.push(search);
}

return {
text: `
Expand All @@ -132,13 +146,25 @@ exports.countryGroup = (searchText, country, type) => {
${whereClause}
GROUP BY country, iso3;
`,
values: [],
values,
};
};

exports.statsQuery = (searchText, country, type) => {
let whereClause = theWhereClause(country, type);
let searchTextCondition = searchTextConditionFn(searchText);
const search = searchText ? parsers.regexQuery(searchText) : '';
let searchTextCondition = '';
const values = [];
if (searchText !== null && searchText !== undefined && searchText.length > 0) {
searchTextCondition = `
AND (title ~* ('\\m' || $1 || '\\M')
OR content ~* ('\\m' || $1 || '\\M')
OR all_html_content ~* ('\\m' || $1 || '\\M')
OR country ~* ('\\m' || $1 || '\\M'))
`;

values.push(search);
}

return {
text: `
Expand Down Expand Up @@ -176,13 +202,25 @@ exports.countryGroup = (searchText, country, type) => {
(SELECT COUNT(DISTINCT article_type) FROM total_article_type_count) AS distinct_article_type_count,
(SELECT total_records FROM total_count) AS total_records;
`,
values: [],
values,
};
};

exports.extractGeoQuery = (searchText, country, type) => {
let whereClause = theWhereClause(country, type);
let searchTextCondition = searchTextConditionFn(searchText);
const search = searchText ? parsers.regexQuery(searchText) : '';
let searchTextCondition = '';
const values = [];
if (searchText !== null && searchText !== undefined && searchText.length > 0) {
searchTextCondition = `
AND (title ~* ('\\m' || $1 || '\\M')
OR content ~* ('\\m' || $1 || '\\M')
OR all_html_content ~* ('\\m' || $1 || '\\M')
OR country ~* ('\\m' || $1 || '\\M'))
`;

values.push(search);
}

return {
text: `
Expand Down Expand Up @@ -210,8 +248,6 @@ exports.countryGroup = (searchText, country, type) => {
GROUP BY clusters.cid
ORDER BY clusters.cid;
`,
values: [],
values,
};
};


};
103 changes: 101 additions & 2 deletions routes/browse/blog/searchBlogs.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,28 @@
const { searchBlogQuery } = require('./query')
const { parsers } = include('routes/helpers/')

exports.main = async kwargs => {
const conn = kwargs.connection ? kwargs.connection : DB.conn
const { req, res, baseurl, page, page_content_limit } = kwargs || {}

const searchText = req.query.search || '';
let { source, search, country, type } = req.query || {}
const searchRegex = searchText ? parsers.regexQuery(searchText) : '';

return conn.task(t => {
return t.any(searchBlogQuery(searchText, page, country, type)).then(async (results) => {
// console.log('searchRegex ', searchRegex)
// Process each row and collect results
const res = results.map((row) => {
const matchedTexts = extractContext(row.content, searchRegex);
delete row.content
delete row.all_html_content
// console.log('matchedTexts ', matchedTexts)
return { ...row, matched_texts: matchedTexts };
});

return {
searchResults : results,
searchResults : res,
page,
total_pages : results[0]?.total_pages || 0,
totalRecords : results[0]?.total_records || 0
Expand All @@ -25,4 +37,91 @@ exports.main = async kwargs => {
})

})
}
}

/**
 * Builds a short plain-text excerpt of `text` around the first place the
 * search pattern matches.
 *
 * The excerpt starts up to `wordsBefore` words ahead of the word containing
 * the match and runs for up to `wordsAfter` words from that word onwards.
 * When there is no search term, or the pattern does not match, the excerpt
 * is simply the leading `wordsAfter` words of the text.
 *
 * @param {string|null|undefined} text - Article body to excerpt.
 * @param {string} searchTerm - Search pattern; it is passed through
 *   `sanitizeSearchTerm` to obtain the RegExp used for matching.
 * @param {number} [wordsBefore=10] - Words of leading context to keep.
 * @param {number} [wordsAfter=100] - Words to keep from the matched word on.
 * @returns {string} The excerpt, or '' when `text` is missing or blank (so
 *   callers can fall back with `excerpt || defaultMessage`).
 */
function extractContext(text, searchTerm, wordsBefore = 10, wordsAfter = 100) {
  // Rows can come back without any extractable content.
  if (typeof text !== 'string') return '';
  const normalized = text.trim();
  if (normalized.length === 0) return '';

  const words = normalized.split(/\s+/);

  let matchWordIndex = 0;
  if (searchTerm) {
    let match = null;
    try {
      match = normalized.match(sanitizeSearchTerm(searchTerm));
    } catch (err) {
      // The excerpt is presentational only: a pattern that is not a valid
      // JavaScript regex should not fail the request, so fall back to the
      // leading words. Anything other than a bad pattern is re-thrown.
      if (!(err instanceof SyntaxError)) throw err;
    }

    if (match) {
      // Count whole words in front of the match instead of re-deriving the
      // position from word lengths, which breaks on irregular whitespace.
      const preceding = normalized.slice(0, match.index);
      const trimmedPreceding = preceding.trim();
      const precedingWords = trimmedPreceding.length > 0 ? trimmedPreceding.split(/\s+/) : [];
      matchWordIndex = precedingWords.length;
      // A match that begins mid-word: the last preceding token is the head
      // of the matched word, so step back onto it.
      if (precedingWords.length > 0 && !/\s$/.test(preceding)) matchWordIndex -= 1;
    }
  }

  matchWordIndex = Math.min(matchWordIndex, words.length - 1);

  // The window must be derived AFTER the match position is known.
  const startIndex = Math.max(0, matchWordIndex - wordsBefore);
  const endIndex = Math.min(words.length, matchWordIndex + wordsAfter);
  return words.slice(startIndex, endIndex).join(' ');
}


/**
 * Converts a search pattern that uses the `\m` / `\M` word-boundary markers
 * (the same markers the SQL in this feature wraps around the term) into a
 * case-insensitive JavaScript RegExp.
 *
 * Steps:
 *  1. `\m` and `\M` are rewritten to `\b`.
 *  2. Every remaining backslash that is not part of a `\b` is dropped, so
 *     other escapes cannot reach the RegExp constructor.
 *
 * Only the escaped markers are rewritten; ordinary letters "m"/"M" inside
 * the term are left untouched. Alternations written with `|` pass through
 * unchanged.
 *
 * NOTE(review): presumably `searchTerm` is the output of
 * `parsers.regexQuery` — confirm which escapes that helper can emit.
 *
 * @param {string|null|undefined} searchTerm - Pattern to convert; null or
 *   undefined is treated as an empty pattern.
 * @returns {RegExp} Case-insensitive regular expression.
 * @throws {SyntaxError} If the cleaned pattern is still not a valid regex.
 */
function sanitizeSearchTerm(searchTerm) {
  const source = String(searchTerm ?? '')
    // Translate the word-boundary markers first, while the backslash is still there.
    .replace(/\\[mM]/g, '\\b')
    // Then drop any backslash that does not introduce a \b.
    .replace(/\\(?!b)/g, '');

  return new RegExp(source, 'i');
}





// Highlights embedding terms and filter terms inside each document's text,
// records a per-term match count for the embedding terms, and JSON-parses
// each entry of `tags` when present. Documents are modified in place and the
// same objects are returned.
// NOTE(review): relies on an `Array.prototype.unique` helper that is not
// defined in this block — confirm it is loaded before this runs.
const documents = _data => {
  const { embeddings, documents: docs, filters, language } = _data

  // WORD BOUNDARIES: 'AR' swaps the regex shorthands for explicit character classes
  const isArabic = language === 'AR'
  const b = isArabic ? '(^|[^\u0621-\u064A])' : '\\b'
  const B = isArabic ? '($|[^\u0621-\u064A])' : '\\B' // defined alongside `b`; not referenced below

  // Flatten the term groups, de-duplicate, and drop terms starting with '-'
  const usableTerms = groups => {
    if (!groups) return []
    return groups.flat().unique().filter(term => term.charAt(0) !== '-')
  }
  const terms = usableTerms(embeddings)
  const filteredTerms = usableTerms(filters)

  // Terms containing '*' extend over word characters only; all others extend
  // to the next whitespace
  const findOccurrences = (text, term) => {
    const suffix = term.indexOf('*') !== -1 ? '(\\w+)?' : '(\\S+)?'
    return text.match(new RegExp(`${b}(\#)?${term.trim()}${suffix}`, 'gi'))
  }

  return docs.map(doc => {
    doc.matches = []

    for (const term of terms) {
      const found = findOccurrences(doc.text, term)
      if (found) {
        // Each distinct matched string is wrapped once (first occurrence)
        found.unique().forEach(hit => {
          doc.text = doc.text.replace(hit, `<span class='highlight-term'>${hit}</span>`)
        })
      }
      doc.matches.push({ term, count: (found || []).length })
    }

    for (const term of filteredTerms) {
      const found = findOccurrences(doc.text, term)
      if (found) {
        found.forEach(hit => {
          doc.text = doc.text.replace(hit, `<span class='highlight-filtered-term'>${hit}</span>`)
        })
      }
    }

    if (doc.tags) doc.tags = doc.tags.map(raw => JSON.parse(raw))
    return doc
  })
}



26 changes: 1 addition & 25 deletions views/browse/blogs/index.ejs
Original file line number Diff line number Diff line change
Expand Up @@ -152,31 +152,7 @@
</div>
<div class="media media-txt" style="margin-top: 20px; white-space: inherit !important;">
<a class="pad-link" target="_blank" style="width: 100% !important; ">
<%
function findMatchingText(largeText) {

largeText.trim().replace(/^\n+|\n+$/g, '');

var searchText = locals?.metadata?.page?.query?.['search'] || ""

var escapedSearchText = searchText.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
var pattern = new RegExp(`[^.!?]*${escapedSearchText}[^.!?]*[.!?]`, 'g');
var matches = largeText.match(pattern);

if (matches && matches.length > 0) {
var firstSentence = matches[0];
var secondSentence = matches[1];
var lastSentence = matches[matches.length - 1];
return firstSentence + " " + `${secondSentence || ''}`;
} else {
return largeText?.split(' ')?.slice(0, 100)?.join(' ') || null;
}
}

var extractedText = findMatchingText(result?.content || result?.all_html_content || '');
%>

<%= extractedText || 'We are unable to extract content from the original website. Please click to read content on the original website.' %> ... <a target="_blank" href="<%= result.url %>" style="cursor: pointer; color: cornflowerblue;">read more</a>
<%= result?.matched_texts || 'We are unable to extract content from the original website. Please click to read content on the original website.' %> ... <a target="_blank" href="<%= result.url %>" style="cursor: pointer; color: cornflowerblue;">read more</a>
</a>
</div>
<div class="meta tag-group">
Expand Down