/*********************************************************
*
* File:  p5prof.h
* By:    Kevin Baca
*
* MODIFIED BY Fab SO THAT RDMSR(...) WRITES EDX : EAX TO A LONG LONG
* (WHICH MEANS WRITE THE LOW DWORD FIRST)
*
* Now in yer code do:
*   INT64 count,total;
*
*   ...
*   RDMSR(0x10,&count);   //inner loop count
*   total += count;
*   ...
*
*   printf("0x%x %x", *((INT32 *)&total+1), (INT32)total);
*   //                 HIGH                  LOW
*
*********************************************************/
/**\file
\brief This file provides macros to profile your code.

Here's how they work...

As you may or may not know, the Pentium class of processors provides
extremely fine grained profiling capabilities through the use of what
are called Machine Specific Registers (MSRs). These registers can provide
information about almost any aspect of CPU performance down to a single
cycle.

The MSRs of interest for profiling are specified by indices 0x10, 0x11,
0x12, and 0x13.  Here is a brief description of each of these registers:

MSR 0x10
   This register is simply a cycle counter.

MSR 0x11
   This register controls what type of profiling data will be gathered.

MSRs 0x12 and 0x13
   These registers gather the profiling data specified in MSR 0x11.

Each MSR is 64 bits wide.  For the Pentium processor, only the lower 32
bits of MSR 0x11 are valid.  Bits 0-15 specify what data will be gathered
in MSR 0x12.  Bits 16-31 specify what data will be gathered in MSR 0x13.
Both sets of bits have the same format:

Bits 0-5 specify which hardware event will be tracked.
Bit 6, if set, indicates events will be tracked in rings 0-2.
Bit 7, if set, indicates events will be tracked in ring 3.
Bit 8, if set, indicates cycles should be counted for the specified event.
If clear, it indicates the number of events should be counted.

Two instructions are provided for manipulating the MSRs.
RDMSR (Read Machine Specific Register) and
WRMSR (Write Machine Specific Register).
These opcodes were originally undocumented and therefore
most assemblers don't recognize them.  Their byte codes are provided in the
macros below.

RDMSR takes the MSR index in ecx and returns the profile data in edx : eax.
WRMSR takes the MSR index in ecx and the profiling criteria in edx : eax.

Two profiling registers limits profiling capability to gathering only two
types of information.  The register usage can, however, be combined in
interesting ways.  For example, you can set one register to gather the
number of a specific type of event while the other gathers the number of
cycles for the same event.  Or you can gather the number of two separate
events while using MSR 0x10 to gather the number of cycles.

The enumerated list provides somewhat readable labels for the types of
events that can be tracked.

For more information, get ahold of appendix H from the Intel Pentium
programmer's manual (I don't remember the order number) or go to
http://green.kaist.ac.kr/jwhahn/art3.htm.
That's an article by Terje Mathisen where I got most of my information.

You may use this code however you wish.  I hope it's useful and I hope I got
everything right.

-Kevin

kbaca@skygames.com
*/

#ifndef P5PROF_H
#define P5PROF_H

/*
 * NOTE: UINT32 is not defined in this header; it must be provided by the
 * including translation unit before the Watcom prototypes are seen or
 * ProfSetProfiles() is expanded.
 *
 * NOTE(review): Intel documents RDMSR/WRMSR as privileged instructions --
 * confirm the calling environment is allowed to execute them.
 *
 * Dword order written to memory (kept exactly as in the original code):
 *
 *   macro              GNU C                    Watcom
 *   RDTSC              [0]=EDX(high) [1]=EAX    [0]=EDX(high) [1]=EAX
 *   RDMSR              [0]=EAX(low)  [1]=EDX    [0]=EDX(high) [1]=EAX
 *   RDMSR_0x12_0x13    [0]=EDX(high) [1]=EAX    [0]=EDX(high) [1]=EAX
 *
 * Only the GNU C RDMSR stores the low dword first (Fab's modification), so
 * only that variant yields a native little-endian 64-bit integer.
 */

#ifdef __GNUC__

/*
 * The asm statements below list only registers that are NOT operands in
 * their clobber lists (GCC rejects an operand register that is also
 * clobbered) and add a "memory" clobber wherever the asm stores through a
 * pointer, so the compiler does not cache the destination across the asm.
 */

/* Executes opcode 0F 31 and stores EDX at _dst[0], EAX at _dst[1]. */
#define RDTSC(_dst) \
    __asm__ __volatile__( \
        ".byte 0x0F,0x31\n\t" \
        "movl %%edx,(%0)\n\t" \
        "movl %%eax,4(%0)" \
        : \
        : "D" (_dst) \
        : "eax", "edx", "memory")

/*
 * Executes opcode 0F 32 with ECX = _msri and stores EAX at _msrd[0],
 * EDX at _msrd[1].  The old code stored EDX first and EAX second; it was
 * swapped so that the low dword is written first.
 */
#define RDMSR(_msri, _msrd) \
    __asm__ __volatile__( \
        ".byte 0x0F,0x32\n\t" \
        "movl %%eax,(%1)\n\t" \
        "movl %%edx,4(%1)" \
        : \
        : "c" (_msri), "D" (_msrd) \
        : "eax", "edx", "memory")

/*
 * Executes opcode 0F 30 with ECX = _msri and EAX = _msrd.  EDX is zeroed
 * first, so the upper 32 bits passed to the instruction are always zero.
 */
#define WRMSR(_msri, _msrd) \
    __asm__ __volatile__( \
        "xorl %%edx,%%edx\n\t" \
        ".byte 0x0F,0x30" \
        : \
        : "c" (_msri), "a" (_msrd) \
        : "edx", "cc")

/*
 * Reads MSR 0x12 into _msr12 and MSR 0x13 into _msr13.  For each
 * destination EDX is stored at [0] and EAX at [1].
 */
#define RDMSR_0x12_0x13(_msr12, _msr13) \
    __asm__ __volatile__( \
        "movl $0x12,%%ecx\n\t" \
        ".byte 0x0F,0x32\n\t" \
        "movl %%edx,(%0)\n\t" \
        "movl %%eax,4(%0)\n\t" \
        "movl $0x13,%%ecx\n\t" \
        ".byte 0x0F,0x32\n\t" \
        "movl %%edx,(%1)\n\t" \
        "movl %%eax,4(%1)" \
        : \
        : "D" (_msr12), "S" (_msr13) \
        : "eax", "ecx", "edx", "memory")

/* Writes EDX:EAX = 0 to MSR 0x12 and then to MSR 0x13. */
#define ZERO_MSR_0x12_0x13() \
    __asm__ __volatile__( \
        "xorl %%edx,%%edx\n\t" \
        "xorl %%eax,%%eax\n\t" \
        "movl $0x12,%%ecx\n\t" \
        ".byte 0x0F,0x30\n\t" \
        "movl $0x13,%%ecx\n\t" \
        ".byte 0x0F,0x30" \
        : \
        : \
        : "eax", "ecx", "edx", "cc")

#elif defined (__WATCOMC__)

/* Executes opcode 0F 31 and stores EDX at dst[0], EAX at dst[1]. */
extern void RDTSC(UINT32 *dst);
#pragma aux RDTSC =\
    "db 0x0F,0x31"\
    "mov [edi],edx"\
    "mov [4+edi],eax"\
    parm [edi]\
    modify [eax edx edi];

/* Executes opcode 0F 32 with ECX = msri; stores EDX at msrd[0], EAX at msrd[1]. */
extern void RDMSR(UINT32 msri, UINT32 *msrd);
#pragma aux RDMSR =\
    "db 0x0F,0x32"\
    "mov [edi],edx"\
    "mov [4+edi],eax"\
    parm [ecx] [edi]\
    modify [eax ecx edx edi];

/* Executes opcode 0F 30 with ECX = msri, EAX = msrd and EDX zeroed. */
extern void WRMSR(UINT32 msri, UINT32 msrd);
#pragma aux WRMSR =\
    "xor edx,edx"\
    "db 0x0F,0x30"\
    parm [ecx] [eax]\
    modify [eax ecx edx];

/* Reads MSR 0x12 into msr12 and MSR 0x13 into msr13 (EDX at [0], EAX at [1]). */
extern void RDMSR_0x12_0x13(UINT32 *msr12, UINT32 *msr13);
#pragma aux RDMSR_0x12_0x13 =\
    "mov ecx,0x12"\
    "db 0x0F,0x32"\
    "mov [edi],edx"\
    "mov [4+edi],eax"\
    "mov ecx,0x13"\
    "db 0x0F,0x32"\
    "mov [esi],edx"\
    "mov [4+esi],eax"\
    parm [edi] [esi]\
    modify [eax ecx edx edi esi];

/* Writes EDX:EAX = 0 to MSR 0x12 and then to MSR 0x13. */
extern void ZERO_MSR_0x12_0x13(void);
#pragma aux ZERO_MSR_0x12_0x13 =\
    "xor edx,edx"\
    "xor eax,eax"\
    "mov ecx,0x12"\
    "db 0x0F,0x30"\
    "mov ecx,0x13"\
    "db 0x0F,0x30"\
    modify [eax ecx edx];

#endif

/*
 * Hardware event selectors for bits 0-5 of each 16-bit half of MSR 0x11.
 * The enumerators are implicitly numbered from 0, so declaration order is
 * significant: do not reorder or insert entries.  (The enum was declared
 * with a `typedef` that named no type; the keyword has been dropped.)
 */
enum
{
    DataRead,                    /* 0x00 */
    DataWrite,                   /* 0x01 */
    DataTLBMiss,                 /* 0x02 */
    DataReadMiss,                /* 0x03 */
    DataWriteMiss,               /* 0x04 */
    WriteHitEM,                  /* 0x05 */
    DataCacheLinesWritten,       /* 0x06 */
    DataCacheSnoops,             /* 0x07 */
    DataCacheSnoopHit,           /* 0x08 */
    MemAccessBothPipes,          /* 0x09 */
    BankConflict,                /* 0x0A */
    MisalignedDataRef,           /* 0x0B */
    CodeRead,                    /* 0x0C */
    CodeTLBMiss,                 /* 0x0D */
    CodeCacheMiss,               /* 0x0E */
    SegRegLoad,                  /* 0x0F */
    RESERVED0,                   /* 0x10 */
    RESERVED1,                   /* 0x11 */
    Branch,                      /* 0x12 */
    BTBHit,                      /* 0x13 */
    TakenBranchOrBTBHit,         /* 0x14 */
    PipelineFlush,               /* 0x15 */
    InstructionsExeced,          /* 0x16 */
    InstructionsExecedVPipe,     /* 0x17 */
    BusUtilizationClocks,        /* 0x18 */
    PipelineStalledWriteBackup,  /* 0x19 */
    PipelineStalledDateMemRead,  /* 0x1A (name kept as-is for compatibility) */
    PipeLineStalledWriteEM,      /* 0x1B */
    LockedBusCycle,              /* 0x1C */
    IOReadOrWriteCycle,          /* 0x1D */
    NonCacheableMemRef,          /* 0x1E */
    AGI,                         /* 0x1F */
    RESERVED2,                   /* 0x20 */
    RESERVED3,                   /* 0x21 */
    FPOperation,                 /* 0x22 */
    Breakpoint0Match,            /* 0x23 */
    Breakpoint1Match,            /* 0x24 */
    Breakpoint2Match,            /* 0x25 */
    Breakpoint3Match,            /* 0x26 */
    HWInterrupt,                 /* 0x27 */
    DataReadOrWrite,             /* 0x28 */
    DataReadOrWriteMiss          /* 0x29 */
};

/* Bit 8 of a selector: count cycles (set) or count events (clear). */
#define PROF_CYCLES   (0x100)
#define PROF_EVENTS   (0x000)

/* Bits 6 and 7 of a selector: which privilege rings are tracked. */
#define RING_012      (0x40)
#define RING_3        (0x80)
#define RING_0123     (RING_012 | RING_3)

/*
 * The Prof* macros below expand to complete statements (a brace block or a
 * call followed by ';').  That form is kept for compatibility with existing
 * callers; do not use them as the body of an unbraced `if` that has an
 * `else`.
 */

/*void ProfSetProfiles(UINT32 msr12, UINT32 msr13);*/
/*
 * Writes (_msr12 | _msr13 << 16) to MSR 0x11: _msr12 selects what MSR 0x12
 * gathers, _msr13 selects what MSR 0x13 gathers.  The temporary has a
 * prefixed name so it cannot shadow an argument expression called `prof`.
 */
#define ProfSetProfiles(_msr12, _msr13)\
{\
    UINT32 p5prof_msr11_value_ = (_msr12) | ((_msr13) << 16);\
\
    WRMSR(0x11, p5prof_msr11_value_);\
}

/*void ProfBeginProfiles(void);*/
/* Zeroes MSRs 0x12 and 0x13. */
#define ProfBeginProfiles()\
    ZERO_MSR_0x12_0x13();

/*void ProfGetProfiles(UINT32 msr12[2], UINT32 msr13[2]);*/
/* Reads MSRs 0x12 and 0x13 into the two destinations (see RDMSR_0x12_0x13). */
#define ProfGetProfiles(_msr12, _msr13)\
    RDMSR_0x12_0x13(_msr12, _msr13);

/*void ProfZeroTimer(void);*/
/* Writes 0 to MSR 0x10. */
#define ProfZeroTimer()\
    WRMSR(0x10, 0);

/*void ProfReadTimer(UINT32 timer[2]);*/
/* Reads MSR 0x10 into timer (dword order as produced by RDMSR). */
#define ProfReadTimer(timer)\
    RDMSR(0x10, timer);

#endif /* P5PROF_H */

/*EOF*/