Skip to content

Commit e84e827

Browse files
Merge pull request #19 from menloresearch/update-dev-from-master-2025-03-21-03-16
Sync master with upstream release b4932
2 parents 81ae8aa + 9ffcc9e commit e84e827

File tree

8 files changed

+146
-96
lines changed

8 files changed

+146
-96
lines changed

docs/backend/SYCL.md

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,15 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB
237237
cmake --build buildWithCublas --config Release
238238
```
239239

240+
**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
241+
242+
```sh
243+
git clone https://github.com/oneapi-src/oneDNN.git
244+
cd oneDNN
245+
cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
246+
cmake --build build-nvidia --config Release
247+
```
248+
240249
- **Adding support to AMD GPUs**
241250

242251
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
@@ -327,10 +336,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
327336
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
328337

329338
# Option 1: Use FP32 (recommended for better performance in most cases)
330-
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
339+
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
331340

332341
# Option 2: Use FP16
333-
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
342+
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
334343

335344
# build all binaries
336345
cmake --build build --config Release -j -v

examples/server/public/index.html.gz

90 Bytes
Binary file not shown.

examples/server/webui/src/components/ChatScreen.tsx

Lines changed: 53 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,9 @@ export default function ChatScreen() {
9999
canvasData,
100100
replaceMessageAndGenerate,
101101
} = useAppContext();
102-
const [inputMsg, setInputMsg] = useState(prefilledMsg.content());
103-
const inputRef = useRef<HTMLTextAreaElement>(null);
102+
const textarea = useOptimizedTextarea(prefilledMsg.content());
104103

105-
const { extraContext, clearExtraContext } = useVSCodeContext(
106-
inputRef,
107-
setInputMsg
108-
);
104+
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
109105
// TODO: improve this when we have "upload file" feature
110106
const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
111107

@@ -135,9 +131,10 @@ export default function ChatScreen() {
135131
};
136132

137133
const sendNewMessage = async () => {
138-
if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return;
139-
const lastInpMsg = inputMsg;
140-
setInputMsg('');
134+
const lastInpMsg = textarea.value();
135+
if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? ''))
136+
return;
137+
textarea.setValue('');
141138
scrollToBottom(false);
142139
setCurrNodeId(-1);
143140
// get the last message node
@@ -146,13 +143,13 @@ export default function ChatScreen() {
146143
!(await sendMessage(
147144
currConvId,
148145
lastMsgNodeId,
149-
inputMsg,
146+
lastInpMsg,
150147
currExtra,
151148
onChunk
152149
))
153150
) {
154151
// restore the input message if failed
155-
setInputMsg(lastInpMsg);
152+
textarea.setValue(lastInpMsg);
156153
}
157154
// OK
158155
clearExtraContext();
@@ -195,16 +192,13 @@ export default function ChatScreen() {
195192
// send the prefilled message if needed
196193
sendNewMessage();
197194
} else {
198-
// otherwise, focus on the input and move the cursor to the end
199-
if (inputRef.current) {
200-
inputRef.current.focus();
201-
inputRef.current.selectionStart = inputRef.current.value.length;
202-
}
195+
// otherwise, focus on the input
196+
textarea.focus();
203197
}
204198
prefilledMsg.clear();
205199
// no need to keep track of sendNewMessage
206200
// eslint-disable-next-line react-hooks/exhaustive-deps
207-
}, [inputRef]);
201+
}, [textarea.ref]);
208202

209203
// due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg)
210204
const pendingMsgDisplay: MessageDisplay[] =
@@ -258,9 +252,7 @@ export default function ChatScreen() {
258252
<textarea
259253
className="textarea textarea-bordered w-full"
260254
placeholder="Type a message (Shift+Enter to add a new line)"
261-
ref={inputRef}
262-
value={inputMsg}
263-
onChange={(e) => setInputMsg(e.target.value)}
255+
ref={textarea.ref}
264256
onKeyDown={(e) => {
265257
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
266258
if (e.key === 'Enter' && e.shiftKey) return;
@@ -280,11 +272,7 @@ export default function ChatScreen() {
280272
Stop
281273
</button>
282274
) : (
283-
<button
284-
className="btn btn-primary ml-2"
285-
onClick={sendNewMessage}
286-
disabled={inputMsg.trim().length === 0}
287-
>
275+
<button className="btn btn-primary ml-2" onClick={sendNewMessage}>
288276
Send
289277
</button>
290278
)}
@@ -298,3 +286,43 @@ export default function ChatScreen() {
298286
</div>
299287
);
300288
}
289+
290+
export interface OptimizedTextareaValue {
291+
value: () => string;
292+
setValue: (value: string) => void;
293+
focus: () => void;
294+
ref: React.RefObject<HTMLTextAreaElement>;
295+
}
296+
297+
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
298+
// See https://github.com/ggml-org/llama.cpp/pull/12299
299+
function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
300+
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
301+
const textareaRef = useRef<HTMLTextAreaElement>(null);
302+
303+
useEffect(() => {
304+
if (textareaRef.current && savedInitValue) {
305+
textareaRef.current.value = savedInitValue;
306+
setSavedInitValue('');
307+
}
308+
}, [textareaRef, savedInitValue, setSavedInitValue]);
309+
310+
return {
311+
value: () => {
312+
return textareaRef.current?.value ?? savedInitValue;
313+
},
314+
setValue: (value: string) => {
315+
if (textareaRef.current) {
316+
textareaRef.current.value = value;
317+
}
318+
},
319+
focus: () => {
320+
if (textareaRef.current) {
321+
// focus and move the cursor to the end
322+
textareaRef.current.focus();
323+
textareaRef.current.selectionStart = textareaRef.current.value.length;
324+
}
325+
},
326+
ref: textareaRef,
327+
};
328+
}

examples/server/webui/src/utils/llama-vscode.ts

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { useEffect, useState } from 'react';
22
import { MessageExtraContext } from './types';
3+
import { OptimizedTextareaValue } from '../components/ChatScreen';
34

45
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
56
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -14,10 +15,7 @@ interface SetTextEvData {
1415
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
1516
*/
1617

17-
export const useVSCodeContext = (
18-
inputRef: React.RefObject<HTMLTextAreaElement>,
19-
setInputMsg: (text: string) => void
20-
) => {
18+
export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
2119
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
2220
null
2321
);
@@ -27,20 +25,20 @@ export const useVSCodeContext = (
2725
const handleMessage = (event: MessageEvent) => {
2826
if (event.data?.command === 'setText') {
2927
const data: SetTextEvData = event.data;
30-
setInputMsg(data?.text);
28+
textarea.setValue(data?.text);
3129
if (data?.context && data.context.length > 0) {
3230
setExtraContext({
3331
type: 'context',
3432
content: data.context,
3533
});
3634
}
37-
inputRef.current?.focus();
35+
textarea.focus();
3836
}
3937
};
4038

4139
window.addEventListener('message', handleMessage);
4240
return () => window.removeEventListener('message', handleMessage);
43-
}, [inputRef, setInputMsg]);
41+
}, [textarea]);
4442

4543
// Add a keydown listener that sends the "escapePressed" message to the parent window
4644
useEffect(() => {

ggml/src/ggml-sycl/CMakeLists.txt

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,38 @@ ggml_add_backend_library(ggml-sycl
2323
../../include/ggml-sycl.h
2424
)
2525

26+
find_package(DNNL)
27+
set(GGML_SYCL_DNNL 0)
28+
if(DNNL_FOUND)
29+
if (DEFINED ENV{ONEAPI_ROOT} AND NOT DEFINED DNNL_GPU_VENDOR)
30+
# Assuming oneDNN packaged with oneapi release is used which
31+
# supports only intel target
32+
set(DNNL_GPU_VENDOR "INTEL")
33+
if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
34+
message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
35+
endif()
36+
endif()
37+
38+
# Verify oneDNN was compiled for the same target as llama
39+
if("${GGML_SYCL_TARGET}" STREQUAL "${DNNL_GPU_VENDOR}")
40+
target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
41+
set(GGML_SYCL_DNNL 1)
42+
get_target_property(CONFIGS DNNL::dnnl IMPORTED_CONFIGURATIONS)
43+
foreach(CONFIG ${CONFIGS})
44+
get_target_property(DNNL_LIB DNNL::dnnl IMPORTED_LOCATION_${CONFIG})
45+
message(STATUS "Found oneDNN: ${DNNL_LIB}")
46+
endforeach()
47+
else()
48+
message(WARNING
49+
"oneDNN must be compiled for the same target as llama.cpp.
50+
llama.cpp: ${GGML_SYCL_TARGET}, oneDNN: ${DNNL_GPU_VENDOR}.
51+
Disabling oneDNN support.")
52+
endif()
53+
else()
54+
message(STATUS "oneDNN not found, disabling oneDNN support")
55+
endif()
56+
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_DNNL=${GGML_SYCL_DNNL})
57+
2658
if (GGML_SYCL_F16)
2759
if (GGML_SYCL_TARGET STREQUAL "AMD")
2860
message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
@@ -48,18 +80,6 @@ file(GLOB GGML_HEADERS_SYCL "*.hpp")
4880
file(GLOB GGML_SOURCES_SYCL "*.cpp")
4981
target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
5082

51-
find_package(DNNL)
52-
message("-- DNNL found:" ${DNNL_FOUND})
53-
54-
if (GGML_SYCL_TARGET STREQUAL "INTEL")
55-
add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
56-
else()
57-
add_compile_definitions(GGML_SYCL_DNNL=0)
58-
endif()
59-
60-
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
61-
target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
62-
endif()
6383

6484
if (WIN32)
6585
find_package(IntelSYCL REQUIRED)

ggml/src/ggml-sycl/common.hpp

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ static size_t g_scratch_offset = 0;
170170
int get_current_device_id();
171171

172172
inline dpct::err0 ggml_sycl_set_device(const int device) try {
173-
174173
int current_device_id;
175174
SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
176175

@@ -242,6 +241,14 @@ struct ggml_sycl_pool_alloc {
242241
}
243242
}
244243

244+
T * realloc(size_t size) {
245+
GGML_ASSERT(pool != nullptr);
246+
if (ptr)
247+
pool->free(ptr, actual_size);
248+
ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
249+
return ptr;
250+
}
251+
245252
// size is in number of elements
246253
T * alloc(size_t size) {
247254
GGML_ASSERT(pool != nullptr);
@@ -371,10 +378,29 @@ struct ggml_backend_sycl_context {
371378
dnnl::stream stream_dnnl() {
372379
return stream_dnnl(device, 0);
373380
}
381+
dnnl::memory get_scratchpad_mem(const dnnl::memory::desc & scratchpad_md,
382+
const dnnl::engine & eng, const queue_ptr q) {
383+
ggml_sycl_pool_alloc<uint8_t> * pool;
384+
auto it = scratchpad_map.find(q);
385+
if (it == scratchpad_map.end()) {
386+
scratchpad_map[q] = std::make_unique<ggml_sycl_pool_alloc<uint8_t>>(this->pool());
387+
pool = scratchpad_map[q].get();
388+
} else {
389+
pool = it->second.get();
390+
}
391+
392+
size_t scratchpad_size = scratchpad_md.get_size();
393+
if (scratchpad_size > pool->actual_size) {
394+
pool->realloc(scratchpad_size);
395+
}
396+
void * mem_ptr = pool->get();
397+
return dnnl::memory(scratchpad_md, eng, mem_ptr);
398+
}
374399
#endif
375400

376401
// pool
377402
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
403+
std::unordered_map<sycl::queue *, std::unique_ptr<ggml_sycl_pool_alloc<uint8_t>>> scratchpad_map;
378404

379405
std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
380406

0 commit comments

Comments
 (0)