Two years ago, together with other team members, I created and supported a QEMU (0.15.0) system emulation of a custom board for an embedded product.
We modeled all the devices, then we created a new system — et voilà, we had a simulator. It was perfect: we could start our FW development long before we had a real system. However, something was missing. For more accurate simulation we needed LRU (least-recently-used page tracking). As far as I know there is no such device/facility in QEMU, so I would like to share my solution.
- in order to implement LRU we should keep track of all memory accesses i.e. read, write and fetch.
- following is a little background for understanding how target memory is implemented.
- In order to create a target one should use qemu_ram_alloc and cpu_register_physical_memory APIs. cpu_register_physical_memory function associates target physical address with a host buffer created by qemu_ram_alloc.
- QEMU has the following structure defined in cpu-defs.h
target_ulong addr_read;
CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];
- When there is an access to memory e.g. read from address x, QEMU checks whether tlb_table[cpu_mmu_index(env)][(x >> TARGET_PAGE_BITS) % CPU_TLB_SIZE].addr_read == (x >> TARGET_PAGE_BITS).
- if it does not then cpu_handle_mmu_fault is called and fills up the entry so that addr_read = (x >> TARGET_PAGE_BITS) and addr_read + addend points into host memory buffer, previously allocated by qemu_ram_alloc .
- otherwise QEMU just accesses from x + addend address in the host memory.
- so, in order to implement LRU, the following things should be done
1. add a global CPU counter of memory accesses ( tlb_timer ) and initialize it to zero
#define CPU_COMMON_TLB \
...
unsigned long tlb_timer;
typedef struct CPUTLBEntry {
target_ulong addr_read;
target_ulong addr_write;
target_ulong addr_code;
uintptr_t addend;
...
...
} CPUTLBEntry;
CPUTLBEntry tlb_table[NB_MMU_MODES][CPU_TLB_SIZE];
- When there is an access to memory e.g. read from address x, QEMU checks whether tlb_table[cpu_mmu_index(env)][(x >> TARGET_PAGE_BITS) % CPU_TLB_SIZE].addr_read == (x >> TARGET_PAGE_BITS).
- if it does not then cpu_handle_mmu_fault is called and fills up the entry so that addr_read = (x >> TARGET_PAGE_BITS) and addr_read + addend points into host memory buffer, previously allocated by qemu_ram_alloc .
- otherwise QEMU just accesses from x + addend address in the host memory.
- so, in order to implement LRU following things should be done
1. add a global CPU counter of memory accesses ( tlb_timer ) and initialize it to zero
#define CPU_COMMON_TLB \
...
unsigned long tlb_timer;
2. add a time stamp of the last memory access to each tlb entry
typedef struct CPUTLBEntry
{
target_ulong addr_read;
target_ulong addr_write;
target_ulong addr_code;
unsigned long addend;
signed long timestamp; // timestamp of last access to this entry
...
...
} CPUTLBEntry;
3. update the time stamp for code fetches in the tb_find_fast function. Just before the end of the function, add the following:
if((te->addr_code & TARGET_PAGE_MASK) == (pc & TARGET_PAGE_MASK))
{
env->tlb_timer++;
te->timestamp = env->tlb_timer;
}
if((te->addr_code & TARGET_PAGE_MASK) == (pc & TARGET_PAGE_MASK))
{
env->tlb_timer++;
te->timestamp = env->tlb_timer;
}
4. modify
tcg_out_tlb_load function that actually implements the procedure of read/write memory access described above. I did it for x86-host only. it should be done for each host architecture QEMU is running on.
static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
int mem_index, int s_bits,
const TCGArg *args,
uint8_t **label_ptr, int which)
{
const int addrlo = args[addrlo_idx];
const int r0 = tcg_target_call_iarg_regs[0];
const int r1 = tcg_target_call_iarg_regs[1];
TCGType type = TCG_TYPE_I32;
int rexw = 0;
if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 64) {
type = TCG_TYPE_I64;
rexw = P_REXW;
}
//
tcg_out_mov(s, type, r1, addrlo);
tcg_out_mov(s, type, r0, addrlo);
tcg_out_shifti(s, SHIFT_SHR + rexw, r1,
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
tgen_arithi(s, ARITH_AND + rexw, r0,
TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0);
tgen_arithi(s, ARITH_AND + rexw, r1,
(CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, r1, TCG_AREG0, r1, 0,
offsetof(CPUState, tlb_table[mem_index][0])
+ which);
/* cmp 0(r1), r0 */
tcg_out_modrm_offset(s, OPC_CMP_GvEv + rexw, r0, r1, 0);
tcg_out_mov(s, type, r0, addrlo);
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
label_ptr[0] = s->code_ptr;
s->code_ptr++;
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
/* cmp 4(r1), addrhi */
tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
label_ptr[1] = s->code_ptr;
s->code_ptr++;
}
/* TLB Hit. */
// this->tlb_timer++
tcg_out_modrm_offset(s, OPC_LEA + P_REXW, r0, TCG_AREG0, offsetof(CPUState, tlb_timer));
tcg_out_modrm_offset(s, OPC_GRP5 + rexw, EXT5_INC_Ev, r0, 0);
// r0 = this->tlb_timer
tcg_out_ld(s, type, r0, r0, 0);
// tlb_entry->timestamp = r0
tcg_out_st(s, type, r0, r1, offsetof(CPUTLBEntry, timestamp) - which);
/* add addend(r1), r0 */
tcg_out_mov(s, type, r0, addrlo);
tcg_out_modrm_offset(s, OPC_ADD_GvEv + P_REXW, r0, r1,
offsetof(CPUTLBEntry, addend) - which);
static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
int mem_index, int s_bits,
const TCGArg *args,
uint8_t **label_ptr, int which)
{
const int addrlo = args[addrlo_idx];
const int r0 = tcg_target_call_iarg_regs[0];
const int r1 = tcg_target_call_iarg_regs[1];
TCGType type = TCG_TYPE_I32;
int rexw = 0;
if (TCG_TARGET_REG_BITS == 64 && TARGET_LONG_BITS == 64) {
type = TCG_TYPE_I64;
rexw = P_REXW;
}
//
tcg_out_mov(s, type, r1, addrlo);
tcg_out_mov(s, type, r0, addrlo);
tcg_out_shifti(s, SHIFT_SHR + rexw, r1,
TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
tgen_arithi(s, ARITH_AND + rexw, r0,
TARGET_PAGE_MASK | ((1 << s_bits) - 1), 0);
tgen_arithi(s, ARITH_AND + rexw, r1,
(CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, r1, TCG_AREG0, r1, 0,
offsetof(CPUState, tlb_table[mem_index][0])
+ which);
/* cmp 0(r1), r0 */
tcg_out_modrm_offset(s, OPC_CMP_GvEv + rexw, r0, r1, 0);
tcg_out_mov(s, type, r0, addrlo);
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
label_ptr[0] = s->code_ptr;
s->code_ptr++;
if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
/* cmp 4(r1), addrhi */
tcg_out_modrm_offset(s, OPC_CMP_GvEv, args[addrlo_idx+1], r1, 4);
/* jne label1 */
tcg_out8(s, OPC_JCC_short + JCC_JNE);
label_ptr[1] = s->code_ptr;
s->code_ptr++;
}
/* TLB Hit. */
// this->tlb_timer++
tcg_out_modrm_offset(s, OPC_LEA + P_REXW, r0, TCG_AREG0, offsetof(CPUState, tlb_timer));
tcg_out_modrm_offset(s, OPC_GRP5 + rexw, EXT5_INC_Ev, r0, 0);
// r0 = this->tlb_timer
tcg_out_ld(s, type, r0, r0, 0);
// tlb_entry->timestamp = r0
tcg_out_st(s, type, r0, r1, offsetof(CPUTLBEntry, timestamp) - which);
/* add addend(r1), r0 */
tcg_out_mov(s, type, r0, addrlo);
tcg_out_modrm_offset(s, OPC_ADD_GvEv + P_REXW, r0, r1,
offsetof(CPUTLBEntry, addend) - which);
}
5. add a timer that, once a minute, will update all timestamps in all valid entries so they all are in range [0 ... N), where N is the number of valid entries, and tlb_timer is set to N-1. This will prevent tlb_timer overflow.
Having done all this, we added our specific LRU device & registers which, upon a request to provide the least recently used physical page, examined all valid entries in tlb_table and returned the one with the lowest timestamp.