
Commit 27708f2

updated project with Transformer support
1 parent cb3c41f commit 27708f2

19 files changed: +1403 −23 lines

CITATION.cff

Lines changed: 1 addition & 1 deletion
@@ -6,4 +6,4 @@ authors:
  orcid: https://orcid.org/0000-0003-1786-7551
  title: "MMKit-Features: Multimodal Features Extraction Toolkit"
  version: 0.0.1
- date-released: 2022-06-04
+ date-released: 2023-05-12

README.md

Lines changed: 57 additions & 15 deletions
@@ -1,29 +1,39 @@
  # MMKit-Features: Multimodal Feature Extraction Toolkit

- A light-weight Python library to utilize multimodal features for deep learning.
+ Traditional knowledge graphs (KGs) are usually comprised of entities, relationships, and attributes. However, they are not designed to effectively store or represent multimodal data. This limitation prevents them from capturing and integrating information from different modes of data, such as text, images, and audio, in a meaningful and holistic way.
+ 
+ The `MMKit-Features` project proposes a multimodal architecture to build multimodal knowledge graphs with flexible multimodal feature extraction and dynamic multimodal concept generation.

  ## Project Goal
- - To extract, store, and fuse various features from multimodal datasets rapidly and efficiently;
- - To provide a common multimodal information processing framework for multimodal features;
- - To achieve generative adversarial network-based multimodal knowledge representation dynamically.
+ - To extract, store, and fuse various multimodal features from multimodal datasets efficiently;
+ - To achieve generative adversarial network (GAN)-based multimodal knowledge representation dynamically in multimodal knowledge graphs;
+ - To provide a common deep learning-based architecture to enhance multimodal knowledge reasoning in real life.

  ## Installation

+ You can install this toolkit using our [PyPI](https://pypi.org/project/mmkit-features/) package.
+ 
  ```
  pip install mmkit-features
  ```

- ## Framework
+ ## Design Science Framework
+ 
+ ![Multimodal Computational Sequence](doc/images/multimodal-computational-sequence.jpg)
+ 
+ Figure 1: Multimodal Computational Sequence

- ![Design science canvas](https://dhchenx.github.io/projects/mmk-features/images/design-science-canvas.jpg)
+ ![GAN-based Multimodal Concept Generation](doc/images/gan-based-cross-modal-generation.jpg)
+ 
+ Figure 2: GAN-based Multimodal Concept Generation

  ## Modalities

  1. Text/Language modality
  2. Image modality
  3. Video modality
- 4. Speech/sound modality
- 5. Cross-modality between above
+ 4. Audio modality
+ 5. Cross-modality among the above

  ## Usage
  A toy example showing how to build a multimodal feature (MMF) library is here:
@@ -34,19 +44,19 @@ from mmkfeatures.fusion.mm_features_node import MMFeaturesNode
  import numpy as np
  if __name__ == "__main__":
      # 1. create an empty multimodal features library with root and dataset names
-     feature_lib=MMFeaturesLib(root_name="test features",dataset_name="test_features")
+     feature_lib = MMFeaturesLib(root_name="test features", dataset_name="test_features")
      # 2. set short names for each dimension for convenience
      feature_lib.set_features_name(["feature1","feature2","feature3"])
      # 3. set a list of content IDs
-     content_ids=["content1","content2","content3"]
+     content_ids = ["content1","content2","content3"]
      # 4. according to IDs, assign a group of features with intervals to the corresponding content ID
-     features_dict={}
+     features_dict = {}
      for id in content_ids:
-         mmf_node=MMFeaturesNode(id)
+         mmf_node = MMFeaturesNode(id)
          mmf_node.set_item("name",str(id))
          mmf_node.set_item("features",np.array([[1,2,3]]))
          mmf_node.set_item("intervals",np.array([[0,1]]))
-         features_dict[id]=mmf_node
+         features_dict[id] = mmf_node
      # 5. set the library's data
      feature_lib.set_data(features_dict)
      # 6. save the features to disk for future use
@@ -55,10 +65,42 @@ if __name__ == "__main__":
      feature_lib.show_structure("test6_feature.csd")
      # 8. have a glance of features content within the dataset
      feature_lib.show_sample_data("test6_feature.csd")
+     # 9. Finally, we construct a simple multimodal knowledge base.
  ```

  Further instructions on the toolkit are available [here](https://github.com/dhchenx/mmkit-features/tree/main/doc).

+ 
+ ## Applications
+ 
+ Here are some examples of using our work in real life, with code and documents.
+ 
+ ### 1. Multimodal Feature Extractors
+ 
+ - [Text Features Extraction](doc/text_features_extraction.md)
+ - [Speech Features Extraction](doc/speech_features_extraction.md)
+ - [Image Features Extraction](doc/image_features_extraction.md)
+ - [Video Features Extraction](doc/video_features_extraction.md)
+ - [Transformer-based Features Extraction](src/mmkfeatures/transformer/README.md)
+ 
+ ### 2. Multimodal Feature Library (MMFLib)
+ 
+ - [Basic Computational Sequence](doc/simple_computational_seq_use.md)
+ - [Core Use of MMFLib](doc/multimodal_features_library.md)
+ 
+ ### 3. Multimodal Knowledge Bases
+ 
+ - [Multimodal Birds Feature Library](doc/example_bird_library.md)
+ - [Multimodal Disease Coding Feature Library](doc/example_icd11_library.md)
+ - [Multimodal ROCO Feature Library](examples/roco_lib/step1_create_lib_roco.py)
+ 
+ ### 4. Multimodal Indexing and Querying
+ 
+ - [Brute Force Indexing](examples/birds_features_lib/step3_use_index.py)
+ - [Inverted Indexing](examples/birds_features_lib/step3_use_index.py)
+ - [Positional Indexing](examples/birds_features_lib/step3_use_index.py)
+ - [Multimodal Indexing and Querying](examples/birds_features_lib/evaluate/)
+ 
  ## Credits

  The project includes some source code from various open-source contributors. Here is a list of their contributions.
@@ -71,11 +113,11 @@ The project includes some source codes from various open-source contributors. He

  ## License

- This project is provided by [Donghua Chen](https://github.com/dhchenx) with MIT license.
+ The `mmkit-features` project is provided by [Donghua Chen](https://github.com/dhchenx) under the MIT license.

  ## Citation

  Please cite our project if it is used in your research.

- Chen, D. (2022). MMKit-Features: Multimodal Features Extraction Toolkit (Version 0.0.1) [Computer software]
+ Chen, D. (2023). MMKit-Features: Multimodal Features Extraction Toolkit (Version 0.0.2) [Computer software]


doc/README.md

Lines changed: 2 additions & 2 deletions
@@ -1,8 +1,8 @@
  # MMKit-Features Documents

- This section presents a summary of usage of the features used in the MMKit-Features Python library.
+ This section summarizes how to use the features provided by the `MMKit-Features` Python library.

- There are several modules used to implement different functions to cope with multimodal features extraction, namely text, image, speech, and video features. Moreover, the toolkit allows us to fuse and store the extracted multimodal features in a rapid and easy manner.
+ To handle the extraction of various multimodal features such as text, image, speech, and video, different modules are utilized. Furthermore, the toolkit enables quick and simple fusion and storage of the extracted features.

  ## Features Extraction


doc/example_icd11_library.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
  ## Establishing an ICD-11 disease coding library

- The example demonstrates steps to create a multimodal feature library using the datasets from International Classification of Diseases, Eleventh Revision (ICD-11). The ICD-11 datasets contains massive text description of disease entities and their complicated relationships. It is also sutable for use to show the use of the `mmkit-features` toolkit.
+ This example demonstrates the steps to create a multimodal feature library using datasets from the International Classification of Diseases, Eleventh Revision (ICD-11). The ICD-11 datasets contain extensive text descriptions of disease entities and their complicated relationships, which makes them well suited to demonstrating the `mmkit-features` toolkit.

  ### Steps



doc/simple_computational_seq_use.md

Lines changed: 4 additions & 2 deletions
@@ -2,7 +2,7 @@

  This base class is derived from the open-source CMU-Multimodal-SDK project, which allows us to store multimodal objects like audio and video files. The core feature of the ```computational sequence``` in the SDK is to provide a simple way to store each chunk's features in time order within video/audio files. For example, we can divide a 1-minute video into 60 1-second clips stored in time order; each clip is then represented by its extracted features. The computational sequence class considers all objects to have one basic property, which is time.

- In our project, we extend the concept of computational sequence in many ways, specially providing a more common way to store, fuse and retrieve extracted features from all sources. In this section, we firstly describe the basic usage of the computational sequence in our project.
+ In our project, we extended the concept of computational sequence in many ways, especially by providing a more general way to store, fuse, and retrieve extracted features from all sources. In this section, we first describe the basic usage of the computational sequence in our project.

  Here is a toy example to show the use of computational sequence.

@@ -120,4 +120,6 @@ if __name__=="__main__":
  mydataset.align("compseq_1")
  ```

- The above example is a simple toy one and not suitable for complicated multimodal features use. Therefore, based on the `computational sequence`, we developed a brand-new and complicated one named `computatoinal_sequencex` to facilitate a common frame of storing and manipulating multimodal features for high-level applications in many fields. We will discuss the new one in other section.
+ The above example is a simple toy one and is not suitable for complicated multimodal feature use. Therefore, based on the `computational sequence`, we developed a brand-new one named `computatoinal_sequencex` to facilitate a common framework for storing and manipulating multimodal features for high-level applications in many fields.
+ 
+ We will discuss the new one in another section.
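
To make the chunking idea above concrete, here is a minimal NumPy sketch (not the SDK's or the toolkit's API) of representing a 60-second video as per-clip feature vectors paired with `[start, end]` intervals, which is the layout a computational sequence stores for each file; the feature dimension is hypothetical.

```
import numpy as np

# A 60-second video split into 60 one-second chunks (hypothetical feature dimension).
num_chunks, feature_dim = 60, 128

features = np.random.rand(num_chunks, feature_dim)                 # one row per clip
intervals = np.array([[t, t + 1.0] for t in range(num_chunks)])    # start/end in seconds

# A computational-sequence-like record for a single video id
record = {"features": features, "intervals": intervals}
print(record["features"].shape, record["intervals"][:3])
```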

doc/text_features_extraction.md

Lines changed: 1 addition & 1 deletion
@@ -34,4 +34,4 @@ if __name__=="__main__":

  ```

- Most of the methods generate word vectors with fixed length to represent text for our analysis. We highly recommend to use GloVe embedding to generate word vectors.
+ Most of the methods generate fixed-length word vectors to represent text for our analysis. We highly recommend using GloVe embeddings to generate word vectors.
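
To illustrate the GloVe recommendation above, here is a generic NumPy sketch (not the toolkit's own extractor) that averages the vectors of in-vocabulary tokens into a sentence vector; the embedding file name is an assumption.

```
import numpy as np

def load_glove(path):
    """Load a plain-text GloVe file: one token followed by its vector per line."""
    vectors = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return vectors

def sentence_vector(text, vectors, dim=100):
    """Average GloVe vectors of in-vocabulary tokens; return zeros if none match."""
    tokens = text.lower().split()
    found = [vectors[t] for t in tokens if t in vectors]
    return np.mean(found, axis=0) if found else np.zeros(dim, dtype=np.float32)

glove = load_glove("glove.6B.100d.txt")   # assumed local copy of the pretrained embeddings
print(sentence_vector("a multimodal feature toolkit", glove).shape)
```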

doc/video_features_extraction.md

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
  ## Video Features Extraction

- Extracting video features from a video file like *.mp4 file is very complicated. There are many frames from the video which are considered as images. But at the same time, we have to consider the temporal information in the video.
+ Extracting video features from a video file such as an `*.mp4` file is complicated: the video contains many frames that can be treated as images, but at the same time we have to consider the temporal information in the video.

  A simple example of extracting video features using the `mmkit-features` toolkit is below:

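
Since the toolkit's own example is not shown in this diff, the following generic OpenCV sketch illustrates the idea described above: sampled frames are treated as images while their timestamps preserve the temporal information. This is not the `mmkit-features` API, and the input file name is hypothetical.

```
import cv2
import numpy as np

def sample_frames(video_path, every_n_seconds=1.0, size=(224, 224)):
    """Keep one resized frame per sampling interval, with its timestamp in seconds."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    step = max(int(round(fps * every_n_seconds)), 1)
    frames, timestamps = [], []
    index = 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if index % step == 0:
            frames.append(cv2.resize(frame, size))
            timestamps.append(index / fps)
        index += 1
    cap.release()
    return np.array(frames), timestamps

frames, ts = sample_frames("example.mp4")   # hypothetical input file
print(frames.shape, ts[:5])
```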

src/mmkfeatures/transformer/README.md

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+ ## Transformer-based Feature Extraction
+ 
+ We also integrate state-of-the-art Transformer-based methods to extract features based on a series of large-scale pretrained models.
+ 
+ ### Examples
+ 
+ 1. Text feature extraction using [Transformer-XL](https://huggingface.co/transfo-xl-wt103).
+ 
+ 2. Image feature extraction based on the [Swin Transformer](https://github.com/microsoft/Swin-Transformer).
+ 
+ More implementations of Transformer-based extractors are coming soon.
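
As a rough illustration of the Transformer-XL example above, the sketch below uses the Hugging Face `transformers` library directly (the toolkit's own wrapper API may differ) and mean-pools the hidden states of `transfo-xl-wt103` into a single text feature vector.

```
import torch
from transformers import TransfoXLTokenizer, TransfoXLModel

tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
model = TransfoXLModel.from_pretrained("transfo-xl-wt103")
model.eval()

# Tokenize a sentence and keep only the input ids expected by the model
input_ids = tokenizer("multimodal knowledge graphs", return_tensors="pt")["input_ids"]
with torch.no_grad():
    outputs = model(input_ids)

# Mean-pool the last hidden states into one fixed-length text feature
text_feature = outputs.last_hidden_state.mean(dim=1)
print(text_feature.shape)   # (1, hidden_size)
```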
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+ # --------------------------------------------------------
+ # Swin Transformer
+ # Copyright (c) 2021 Microsoft
+ # Licensed under The MIT License [see LICENSE for details]
+ # Written by Ze Liu
+ # --------------------------------------------------------
+ 
+ from .swin_transformer import SwinTransformer
+ 
+ 
+ def build_model(config, encoder='swintransformer'):
+     if encoder == 'swintransformer':
+         model = SwinTransformer(img_size=config.DATA.IMG_SIZE,
+                                 patch_size=config.MODEL.SWIN.PATCH_SIZE,
+                                 in_chans=config.MODEL.SWIN.IN_CHANS,
+                                 num_classes=config.MODEL.NUM_CLASSES,
+                                 embed_dim=config.MODEL.SWIN.EMBED_DIM,
+                                 depths=config.MODEL.SWIN.DEPTHS,
+                                 num_heads=config.MODEL.SWIN.NUM_HEADS,
+                                 window_size=config.MODEL.SWIN.WINDOW_SIZE,
+                                 mlp_ratio=config.MODEL.SWIN.MLP_RATIO,
+                                 qkv_bias=config.MODEL.SWIN.QKV_BIAS,
+                                 qk_scale=config.MODEL.SWIN.QK_SCALE,
+                                 ape=config.MODEL.SWIN.APE,
+                                 patch_norm=config.MODEL.SWIN.PATCH_NORM)
+         # Freeze the network parameters so the backbone is used only for feature extraction
+         for para in model.parameters():
+             para.requires_grad = False
+     else:
+         raise NotImplementedError(f"Unknown model: {encoder}")
+ 
+     return model
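
A minimal usage sketch for `build_model` follows, assuming the default yacs config defined in the next file and the standard `SwinTransformer` from the official Swin-Transformer repository (whose `forward_features` returns a pooled feature vector); the import paths are hypothetical because the module file names are not shown in this diff.

```
import torch
from config import _C           # hypothetical module name for the config shown below
from build import build_model   # hypothetical module name for build_model above

config = _C.clone()
model = build_model(config, encoder="swintransformer")
model.eval()

# One dummy RGB image at the configured input size
dummy = torch.randn(1, config.MODEL.SWIN.IN_CHANS,
                    config.DATA.IMG_SIZE, config.DATA.IMG_SIZE)
with torch.no_grad():
    feats = model.forward_features(dummy)   # pooled backbone features
print(feats.shape)   # expected (1, 768) for the tiny configuration
```

Because `build_model` freezes all parameters, the backbone acts purely as a fixed feature extractor for downstream indexing or fusion.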
Lines changed: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
+ # --------------------------------------------------------
+ # Swin Transformer
+ # Copyright (c) 2021 Microsoft
+ # Licensed under The MIT License [see LICENSE for details]
+ # Written by Ze Liu
+ # --------------------------------------------------------'
+ 
+ import os
+ import yaml
+ from yacs.config import CfgNode as CN
+ 
+ _C = CN()
+ 
+ # Base config files
+ _C.BASE = ['']
+ _C.TAG = "default"
+ 
+ # -----------------------------------------------------------------------------
+ # Data settings
+ # -----------------------------------------------------------------------------
+ _C.DATA = CN()
+ # Batch size for a single GPU, could be overwritten by command line argument
+ _C.DATA.BATCH_SIZE = 1
+ # Path to dataset, could be overwritten by command line argument
+ _C.DATA.DATA_PATH = r'D:\UIBE科研\国自科青年\多模态机器学习\projects\mmkit-features\examples\birds_features_lib\datasets\CUB_200_2011\images'
+ _C.DATA.DATABASE_PATH = './database/DB.npz'
+ # Path to index table
+ _C.DATA.INDEX_PATH = './database/index.txt'
+ # Input image size
+ _C.DATA.IMG_SIZE = 224
+ # Interpolation to resize image (random, bilinear, bicubic)
+ _C.DATA.INTERPOLATION = 'bicubic'
+ # Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.
+ _C.DATA.PIN_MEMORY = True
+ # Number of data loading threads
+ _C.DATA.NUM_WORKERS = 8
+ 
+ # -----------------------------------------------------------------------------
+ # Model settings
+ # -----------------------------------------------------------------------------
+ _C.MODEL = CN()
+ # Model type
+ _C.MODEL.TYPE = 'swin'
+ # Model name
+ _C.MODEL.NAME = 'swin_tiny_patch4_window7_224'
+ # num classes
+ _C.MODEL.NUM_CLASSES = 1000
+ 
+ # Swin Transformer parameters
+ _C.MODEL.SWIN = CN()
+ _C.MODEL.SWIN.PATCH_SIZE = 4
+ _C.MODEL.SWIN.IN_CHANS = 3
+ _C.MODEL.SWIN.EMBED_DIM = 96
+ _C.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
+ _C.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
+ _C.MODEL.SWIN.WINDOW_SIZE = 7
+ _C.MODEL.SWIN.MLP_RATIO = 4.
+ _C.MODEL.SWIN.QKV_BIAS = True
+ _C.MODEL.SWIN.QK_SCALE = None
+ _C.MODEL.SWIN.APE = False
+ _C.MODEL.SWIN.PATCH_NORM = True
+ 
+ 
+ def _update_config_from_file(config, cfg_file):
+     config.defrost()
+     with open(cfg_file, 'r') as f:
+         yaml_cfg = yaml.load(f, Loader=yaml.FullLoader)
+ 
+     for cfg in yaml_cfg.setdefault('BASE', ['']):
+         if cfg:
+             _update_config_from_file(
+                 config, os.path.join(os.path.dirname(cfg_file), cfg)
+             )
+     print('=> merge config from {}'.format(cfg_file))
+     config.merge_from_file(cfg_file)
+     config.freeze()
+ 
+ 
+ def update_config(config, args):
+     _update_config_from_file(config, args.cfg)
+ 
+     config.defrost()
+ 
+     # merge from specific arguments
+     if args.batch_size:
+         config.DATA.BATCH_SIZE = args.batch_size
+     if args.data_path:
+         config.DATA.DATA_PATH = args.data_path
+     if args.resume:
+         config.MODEL.RESUME = args.resume
+ 
+     # set local rank for distributed training
+     config.LOCAL_RANK = args.local_rank
+ 
+     config.freeze()
+ 
+ 
+ def get_config(args):
+     """Get a yacs CfgNode object with default values."""
+     # Return a clone so that the defaults will not be altered
+     # This is for the "local variable" use pattern
+     config = _C.clone()
+     update_config(config, args)
+ 
+     return config
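
For completeness, here is a small sketch of driving `get_config` with argparse-style arguments; the YAML override file written here and the `config` import path are assumptions made for illustration only.

```
import argparse
from config import get_config   # hypothetical import of the module above

# Write a tiny override file so _update_config_from_file() has something to merge.
with open("swin_tiny_demo.yaml", "w") as f:
    f.write("DATA:\n  IMG_SIZE: 224\nMODEL:\n  NUM_CLASSES: 200\n")

parser = argparse.ArgumentParser()
parser.add_argument("--cfg", default="swin_tiny_demo.yaml")
parser.add_argument("--batch-size", type=int, default=4)
parser.add_argument("--data-path", default=None)
parser.add_argument("--resume", default=None)
parser.add_argument("--local_rank", type=int, default=0)
args = parser.parse_args([])

config = get_config(args)
print(config.DATA.BATCH_SIZE, config.MODEL.NUM_CLASSES)   # 4 200
```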
