Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HPCC-27051 create a sasha service to clean up post mortem files #19588

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dali/sasha/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ set ( SRCS
sautil.cpp
saverify.cpp
saxref.cpp
sadbghk.cpp
)

include_directories (
Expand Down
224 changes: 224 additions & 0 deletions dali/sasha/sadbghk.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
#include "platform.h"

#include <cstring>

#include "jlib.hpp"
#include "jiface.hpp"
#include "jptree.hpp"
#include "jregexp.hpp"

#include "dadfs.hpp"
#include "dalienv.hpp"

#include "saserver.hpp"
#include "sautil.hpp"

#define DEFAULT_EXPIRY_INTERVAL 24 // hours

#define DEFAULT_EXPIRYDAYS 7

#define LOGDBGHK "DEBUGHOUSEKEEPING: "

// Debug Housekeeping monitor

class CSashaDebugHousekeepingServer : public ISashaServer, public Thread
{
bool stopped;
Semaphore stopsem;
Mutex runmutex;
Linked<IPropertyTree> props;

public:
IMPLEMENT_IINTERFACE_USING(Thread);

// Builds the housekeeping server around the supplied configuration tree.
// The tree is held through the Linked<> 'props' member and is read later by
// run() ("@interval") and runDebugHousekeeping() ("@expiryDefault").
// The worker thread is not started here; start() does that.
CSashaDebugHousekeepingServer(IPropertyTree *_config)
    : Thread("CSashaDebugHousekeepingServer"), props(_config)
{
    // The previous "@user" lookup was removed: its result was never used.
    stopped = false;
}

// Destructor: intentionally empty - there is nothing to release explicitly here;
// the members are destroyed through their own destructors.
~CSashaDebugHousekeepingServer()
{
}

// Starts the worker by forwarding to Thread::start().
// NOTE(review): the meaning of the 'false' argument is defined by Thread::start,
// which is not visible in this file - confirm against jthread.hpp.
void start()
{
Thread::start(false);
}

// Intentionally a no-op: this service performs no work in ready().
void ready()
{
}

// Requests shutdown and waits for the worker thread to finish.
//
// Sets the stop flag and signals 'stopsem' so run() wakes from its one-minute
// wait. It then takes 'runmutex' briefly, so that a housekeeping pass already in
// progress finishes (it polls 'stopped' between directories) before the join.
//
// The mutex is released again *before* join(): runDebugHousekeeping() acquires
// 'runmutex' on entry, so holding it across the join could leave the worker
// blocked on the mutex while this thread sat in join() until the three-minute
// timeout expired and "aborted" was logged.
void stop()
{
    if (!stopped)
    {
        stopped = true;
        stopsem.signal();
    }
    {
        synchronized block(runmutex); // wait for any in-flight pass to notice 'stopped'
    }
    if (!join(1000 * 60 * 3))
        OERRLOG("CSashaDebugHousekeepingServer aborted");
}

void runDebugHousekeeping()
{
synchronized block(runmutex);
if (stopped)
return;
PROGLOG(LOGDBGHK "Started");
unsigned defaultExpireDays = props->getPropInt("@expiryDefault", DEFAULT_EXPIRYDAYS);

// get debug plane dir
StringBuffer debugDir;
StringBuffer planeName;
if (!getDefaultPlane(planeName, "@debugPlane", "debug"))
{
WARNLOG("Exception handlers configured, but debug plane is missing");
return;
}
Owned<IPropertyTree> plane = getStoragePlane(planeName);
assertex(plane);
verifyex(plane->getProp("@prefix", debugDir));

// iterate debug plane selecting post-mortem directories for housekeeping
Owned<IDirectoryIterator> pDirIter = createDirectoryIterator(debugDir.str(), "*", false, true);
ForEach(*pDirIter)
{
if (stopped)
break;

IFile &iDirFile = pDirIter->query();
const char *dirPath = iDirFile.queryFilename();

if (!dirPath || !*dirPath)
continue;

// Process directories, exclude the ".", "..", non post-mortem and not expired post-mortem directories
if (iDirFile.isDirectory() == fileBool::foundYes && *dirPath != '.')
{
// Ensure directory name only
StringBuffer dirNameOnly;
String dirNameStr(dirPath);
int fwdSlashIndex = dirNameStr.lastIndexOf('/');
if (fwdSlashIndex > -1)
{
String *tmpStr = dirNameStr.substring(fwdSlashIndex + 1);
dirNameOnly.append(*tmpStr);
delete tmpStr;
}
else
{
dirNameOnly.append(dirNameStr);
}

if (isPostMortemDirPath(dirNameOnly) && isExpiredDirPath(dirNameOnly, defaultExpireDays))
{
recursiveRemoveDirectory(dirPath);
PROGLOG(LOGDBGHK "Deleted %s", dirPath);
}
}
}
pDirIter.clear();

PROGLOG(LOGDBGHK "%s", stopped ? "Stopped" : "Done");
}

// Thread body: periodically triggers runDebugHousekeeping().
//
// "@interval" (hours, default DEFAULT_EXPIRY_INTERVAL) drives the schedule; a
// value of 0 disables the service. After start-up the loop first waits out an
// initial delay of (interval - 1) / 2 hours, then runs a pass whenever the
// schedule reports ready. The loop wakes once a minute, or immediately when
// stop() signals 'stopsem'.
//
// Returns 0 in all cases.
int run()
{
    unsigned interval = props->getPropInt("@interval", DEFAULT_EXPIRY_INTERVAL);
    PROGLOG(LOGDBGHK "min interval = %u hr", interval);
    if (!interval)
    {
        // Disabled. Bail out before computing (interval - 1), which would wrap
        // around for an unsigned zero.
        stopped = true;
        PROGLOG(LOGDBGHK "Exit");
        return 0;
    }

    unsigned initinterval = (interval - 1) / 2; // hours; wait a bit til dali has started
    CSashaSchedule schedule;
    schedule.init(props, interval, initinterval);
    initinterval *= 60 * 60 * 1000; // ms

    // 'started' holds the start tick until the initial delay has elapsed, after
    // which it is set to (unsigned)-1 so the delay test is skipped from then on.
    unsigned started = msTick();
    while (!stopped)
    {
        stopsem.wait(1000 * 60);
        if (stopped)
            break;
        if ((started != (unsigned)-1) && (msTick() - started < initinterval))
            continue;
        started = (unsigned)-1;
        if (!schedule.ready())
            continue;
        try
        {
            runDebugHousekeeping();
        }
        catch (IException *e)
        {
            EXCLOG(e, LOGDBGHK);
            e->Release();
        }
    }
    PROGLOG(LOGDBGHK "Exit");
    return 0;
}

private:
// Tells whether dirName has the shape of a post-mortem directory name: a 'W',
// eight digits, a '-', then six digits - e.g. "W20250225-101112".
// The decision is logged in both cases.
//
// NOTE(review): open questions from the PR author (to @jakesmith) on this method:
//  - is the "Post-mortem dir" log needed, or would it be more useful as a DBGLOG?
//  - is the "Non post-mortem dir" log needed? It was added in case there were
//    folders that were not expected.
bool isPostMortemDirPath(const StringBuffer &dirName)
{
    const char *name = dirName.str();
    RegExpr postMortemDirRegEx("^W[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]-[0-9][0-9][0-9][0-9][0-9][0-9]$");
    if (!postMortemDirRegEx.find(name))
    {
        PROGLOG(LOGDBGHK "Non post-mortem dir: %s", name);
        return false;
    }
    PROGLOG(LOGDBGHK "Post-mortem dir: %s", name);
    return true;
}

// Tells whether the post-mortem directory named dirName is older than
// defaultExpireDays days.
//
// dirName must have the form "W20250225-101112" (W, yyyymmdd, '-', hhmmss); the
// embedded timestamp is rewritten as "2025-02-25T10:11:12", parsed into a
// CDateTime, advanced by the expiry period and compared with the current time.
//
// Returns true when the directory has expired. Returns false when it has not,
// and also when the name is too short or its timestamp cannot be parsed - a
// directory whose age cannot be established is never reported as expired.
//
// NOTE(review): open question from the PR author (to @jakesmith): is the
// "has expired" log needed, given the deletion itself is logged by the caller?
bool isExpiredDirPath(StringBuffer &dirName, const unsigned &defaultExpireDays)
{
    // 'W' + 8 date digits + '-' + 6 time digits; offsets up to 15 are read below.
    const unsigned expectedLen = 16;
    if (dirName.length() < expectedLen)
        return false;

    const char *name = dirName.str();
    StringBuffer isoTimestamp;
    isoTimestamp.appendf("%.4s-%.2s-%.2sT%.2s:%.2s:%.2s",
                         name + 1,   // yyyy
                         name + 5,   // mm
                         name + 7,   // dd
                         name + 10,  // hh
                         name + 12,  // mm
                         name + 14); // ss

    CDateTime expires;
    try
    {
        expires.setString(isoTimestamp.str());
    }
    catch (IException *e)
    {
        // In case setString() rejects the value by throwing (its implementation
        // is not visible here): treat the directory as not expired rather than
        // letting the exception abort the caller's whole pass.
        EXCLOG(e, LOGDBGHK);
        e->Release();
        return false;
    }
    expires.adjustTime(60 * 24 * defaultExpireDays); // expiry period in minutes

    CDateTime now;
    now.setNow();
    if (now.compare(expires, false) <= 0)
        return false;

    PROGLOG(LOGDBGHK "Post-mortem dir: %s has expired", dirName.str());
    return true;
}

} *sashaDebugHousekeepingServer = NULL;

// Factory for the single debug-housekeeping server instance.
// Asserts that no instance has been created yet, builds one from 'serverConfig',
// records it in the file-level 'sashaDebugHousekeepingServer' pointer and
// returns it.
ISashaServer *createSashaDebugHousekeepingServer()
{
    assertex(!sashaDebugHousekeepingServer); // initialization problem
    Linked<IPropertyTree> config = serverConfig;
    CSashaDebugHousekeepingServer *server = new CSashaDebugHousekeepingServer(config);
    sashaDebugHousekeepingServer = server;
    return server;
}
7 changes: 7 additions & 0 deletions dali/sasha/sadbghk.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#ifndef SADBGHKIF_HPP
#define SADBGHKIF_HPP

interface ISashaServer;
extern ISashaServer *createSashaDebugHousekeepingServer();

#endif
4 changes: 4 additions & 0 deletions dali/sasha/saserver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
#include "saarch.hpp"
#include "saverify.hpp"
#include "saxref.hpp"
#include "sadbghk.hpp"
#include "saqmon.hpp"
#include "sacoalescer.hpp"
#include "sacmd.hpp"
Expand Down Expand Up @@ -75,6 +76,7 @@ static void AddServers()
servers.append(*createSashaDaFSMonitorServer());
servers.append(*createSashaQMonitorServer());
servers.append(*createSashaFileExpiryServer());
servers.append(*createSashaDebugHousekeepingServer());
// add new servers here
}
#endif
Expand Down Expand Up @@ -435,6 +437,8 @@ int main(int argc, const char* argv[])
servers.append(*createSashaFileExpiryServer());
else if (strieq(service, "thor-qmon"))
servers.append(*createSashaQMonitorServer());
else if (strieq(service, "debug-housekeeping"))
servers.append(*createSashaDebugHousekeepingServer());
//else if (strieq(service, "xref")) // TODO
// servers.append(*createSashaXrefServer());
else
Expand Down
23 changes: 23 additions & 0 deletions docs/EN_US/ConfiguringHPCC/ConfiguringHPCC.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2417,6 +2417,29 @@ sudo -u hpcc cp /etc/HPCCSystems/source/NewEnvironment.xml /etc/HPCCSystems/envi

<?hard-pagebreak ?>

<sect3>
<title>SashaServer Process DbgHk</title>

<para>This section describes the SashaServer Process DbgHk
tab.</para>

<para>
<graphic fileref="images/CM-img17-4.jpg" vendor="configmgrSS" />
</para>

<para />

<!--configMgr-Sasha-DbgHk-Include-XXX-24-->

<para>
<xi:include href="../XMLGeneration/xml/sasha.xsd.mod.xml"
xpointer="xpointer(//*[@id='sh.t5'])"
xmlns:xi="http://www.w3.org/2001/XInclude" />
</para>
</sect3>

<?hard-pagebreak ?>

<sect3>
<title>SashaServer Process DaFileSrvMonitor</title>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ dali.sasha.coalescer
sasha.dfurecovery-archiver
sasha.dfuwu-archiver
sasha.file-expiry
sasha.debug-housekeeping
sasha.wu-archiver</programlisting></para>

<para>Notice the warning about ephemeral planes. This is because this
Expand Down
24 changes: 24 additions & 0 deletions docs/PT_BR/ConfiguringHPCC/ConfiguringHPCC.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2416,6 +2416,30 @@ sudo -u hpcc cp /etc/HPCCSystems/source/NewEnvironment.xml /etc/HPCCSystems/envi

<?hard-pagebreak ?>

<sect3>
<title>SashaServer Process DbgHk</title>

<para>Esta seção descreve a aba SashaServer Process
DbgHk</para>

<para>
<graphic fileref="images/CM-img17-4.jpg"
vendor="configmgrSS"/>
</para>

<para/>

<!--configMgr-Sasha-DbgHk-Include-XXX-24-->

<para>
<xi:include href="../XMLGeneration/xml/sasha.xsd.mod.xml"
xpointer="xpointer(//*[@id='sh.t5'])"
xmlns:xi="http://www.w3.org/2001/XInclude"/>
</para>
</sect3>

<?hard-pagebreak ?>

<sect3>
<title>SashaServer Process DaFileSrvMonitor</title>

Expand Down
2 changes: 2 additions & 0 deletions helm/examples/vault-pki-remote/values-hpcc1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ sasha:
disabled: true
file-expiry:
disabled: true
debug-housekeeping:
disabled: true

esp:
- name: eclwatch
Expand Down
2 changes: 2 additions & 0 deletions helm/examples/vault-pki-remote/values-hpcc2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ sasha:
disabled: true
file-expiry:
disabled: true
debug-housekeeping:
disabled: true

esp:
- name: eclwatch
Expand Down
2 changes: 2 additions & 0 deletions helm/hpcc/templates/_helpers.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -1715,6 +1715,8 @@ dali
dali data
{{- else if (eq "thor-qmon" .name) -}}
dali queues
{{- else if (eq "debug-housekeeping" .name) -}}
dali data
{{- else -}}
{{- $_ := fail (printf "Unknown sasha service:" .name ) -}}
{{- end -}}
Expand Down
Loading
Loading