
    wiK                         d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 ddl
mZ ddl
mZ d Zd ZddZddZddZ G d d          Z G d d          Zd Zd dZd dZd Zed!d            Zd dZdS )"    N)contextmanager)AnyDictList   )language)runtimec                     d                     |           } dddd| z   dg}t          j        |          }|                    t          j        j                                      d          }d |D             }|S )N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounitsc                 ,    g | ]}t          |          S  )int.0xs     ^/root/.openclaw/workspace/chatterbox_venv_py311/lib/python3.11/site-packages/triton/testing.py
<listcomp>znvsmi.<locals>.<listcomp>   s    


a3q66


    )join
subprocesscheck_outputdecodesysstdoutencodingsplit)attrscmdoutrets       r   nvsmir$      ss    HHUOOEsNU$:<[
\C

!#
&
&C
**SZ(
)
)
/
/
4
4C

3


CJr   c                 V   dd l }|_|                    | |                    ||j                                                            }t          |          dk    r|d         }|S |dk    r|                                 S  t          ||          |                                           S )Nr   dtyper   all)torchquantiletensorfloattolistlengetattritem)times	quantilesreturn_moder)   r#   s        r   _summarize_statisticsr4      s    LLLnnUELL%+L$N$NOOVVXXs88q==a&C
e||~~&75+&&u--22444r      meanc                    ddl }|dv sJ |j                            |j                                                  5   |              |5|D ]2}|                                 |                    d           d|_        3|j                            d          }|j                            d          }|                                 t          d          D ]}	 |              |                                 |j        
                                 |                    |          dz  }
t          dt          ||
z                      }|j                                        }|j                            |          5  t          |          D ]}	||D ]	}d|_        
 |              	 ddd           n# 1 swxY w Y   |j        
                                 g }d}t          |          D ]}	|j                            d          }|j                            d          }|                                 |                                 |                                 |j        
                                 ||                    |          |z  gz  }t#          |                    |          ||          cddd           S # 1 swxY w Y   dS )	a  
    Benchmark the runtime of the provided function.

    :param fn: Function to benchmark
    :type fn: Callable
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all" Default is "mean".
    :type return_mode: str
    r   Nminmaxr6   medianr(   Tenable_timing   r   
   )r)   cudastreamStreamdetach_requires_grad_gradEventrecordrangesynchronizeelapsed_timer:   r   	CUDAGraphgraphreplayr4   r+   )fnrepgrad_to_noner2   r3   r)   r   start_event	end_event_estimate_msn_repeatgr#   	n_retriess                  r   do_bench_cudagraphrX       sv    LLLAAAAA			5:,,..	/	/ ,P ,P
#!  		  &&& j&&T&::J$$4$88	q 	 	ABDDDD
   !..y99A=q#cK/0011 J  ""Za   	 	8__  +) & &!%		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
   	y!! 	D 	DA****>>K
((t(<<I   HHJJJJ""$$$K,,Y77(BCCCC$U\\#%6%6	;OOY,P ,P ,P ,P ,P ,P ,P ,P ,P ,P ,P ,P ,P ,P ,P ,P ,P ,Ps7   E K<+F4(K4F8	8K;F8	<C<KK	K	   d   c                    |dv sJ ddl }t          j        j                                         |                                               t          j        j                                        }                    d          }                    d          }	|                                 t          d          D ] }
|
                                  |              !|	                                                                  |                    |	          dz  }t          dt          ||z                      }t          dt          ||z                      }fdt          |          D             }fd	t          |          D             }	t          |          D ]}
 |              t          |          D ]b}||D ]	}d|_        
|
                                 ||                                           |              |	|                                          c                                 |                    d
 t!          ||	          D             |j                  }t%          |||          S )a  
    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
    the 20-th and 80-th performance percentile.

    :param fn: Function to benchmark
    :type fn: Callable
    :param warmup: Warmup time (in ms)
    :type warmup: int
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param quantiles: Performance percentile to return in addition to the median.
    :type quantiles: list[float], optional
    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all" Default is "mean".    :type return_mode: str
    r8   r   NTr<   r>   r   c                 <    g | ]}                     d           S Tr<   rF   r   idis     r   r   zdo_bench.<locals>.<listcomp>   s'    IIIA288$8//IIIr   c                 <    g | ]}                     d           S r]   r^   r_   s     r   r   zdo_bench.<locals>.<listcomp>   s'    GGG!--GGGr   c                 >    g | ]\  }}|                     |          S r   )rJ   )r   ses      r   r   zdo_bench.<locals>.<listcomp>   s(    TTT1!..++TTTr   r&   )r)   r	   driveractiveget_device_interfacerI   get_empty_cache_for_benchmarkrF   rG   rH   zero_rJ   r:   r   rE   r+   zipr,   r4   )rN   warmuprO   rP   r2   r3   r)   cacherQ   rR   rS   rT   n_warmuprU   r`   r   r1   ra   s                    @r   do_benchro   _   st   " AAAAALLL			3	3	5	5BBDDDNNN!??AAE (((..Kt,,I1XX  
NN**9559K 1c&;.//00H1c#+,,--HIIIIxIIIKGGGGuXGGGI8__  
8__   #!  A
!NNLLTTK8S8STTT\a\gLhhE 	;???r    c                    ddl }ddl}t          | |j                  s|                    |           } t          ||j                  s|                    |          }|d}t          |          r || j                  n|}|d}t          |          r || j                  n|}t          | |j                  r\| j        |j        k    r|                                 } | 	                                
                                                                 } t          ||j                  r\|j        |j        k    r|                                }|	                                
                                                                 }| j        dk    s|j        dk    r!|j                            | |||d           dS |                    | |||          st          | d	|  d
| d| d| d
          dS )a  
    Asserts that two inputs are close within a certain tolerance.

    :param x: The first input.
    :type x: scala, list, numpy.ndarray, or torch.Tensor
    :param y: The second input.
    :type y: scala, list, numpy.ndarray, or torch.Tensor
    :param atol: The absolute tolerance. Default value is 1e-2.
    :type atol: float, optional
    :param rtol: The relative tolerance. Default value is 0.
    :type rtol: float, optional
    :param err_msg: The error message to use if the assertion fails.
    :type err_msg: str
    r   Ng{Gz?g        r   T)atolrtol	equal_nan)rr   rs    z is not close to z (atol=z, rtol=))numpyr)   
isinstanceTensorr+   callabler'   bfloat16r,   cpudetachsizetestingassert_allcloseallcloseAssertionError)r   yrr   rs   err_msgnpr)   s          r   assert_closer      s    LLL a&& LLOOa&& LLOO|$TNN444===D|$TNN444===D !U\"" %7en$$		AEEGGNN""$$!U\"" %7en$$		AEEGGNN""$$ 	vzzQVaZZ

""1ad"NNN;;q!$T;22 ^\\!\\a\\\\UY\\\]]]^ ^r   c                       e Zd ZdZ	 	 	 	 	 ddee         dee         dedee         d	ee         d
edeeef         dedededefdZ	dS )	Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    rp   FNx_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                     || _         || _        |
| _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        || _        dS )aq  
        Constructor.
        x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
        of scalars and there are multiple x_names, all arguments will have the same value.
        If x_vals is a list of tuples/lists, each element should have the same length as
        x_names.

        :param x_names: Name of the arguments that should appear on the x axis of the plot.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[Any]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
        :type args: Dict[str, Any]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        :param styles: A list of tuples, where each tuple contains two elements: a color and a linestyle.
        :type styles: list[tuple[str, str]]
        N)r   r   r   r   r   r   r   stylesr   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   s                r   __init__zBenchmark.__init__   s]    ^ 
 "$
"			r   )rp   rp   FFN)
__name__
__module____qualname____doc__r   strr   r   boolr   r   r   r   r   r      s          ; ;c; S	; 	;
 9; I; ; 38n; ; ; ; ; ; ; ; ; ;r   r   c            	       :    e Zd Zd Z	 	 ddedededefdZdd
ZdS )Markc                 "    || _         || _        d S NrN   
benchmarks)r   rN   r   s      r   r   zMark.__init__  s    $r   F   bench	save_path
show_plots
print_datac                 .	   dd l }dd lm}	 dd l}
|j        }d |j        D             }d |j        D             }t          |j                  }|
                    ||z   |z   |z             }|j        D ]t          t
          t          f          sfd|D             t                    t          |          k    r"t          dt          |           d           t          t          |                    }g g g }}}|j        D ]Q} | j        di ||j        |i|j        |}	 |\  }}}n# t&          $ r	 |d d }}}Y nw xY w||gz  }||gz  }||gz  }Rt                    |z   |z   |z   |j        t          |          <   |j        r4|	                                 |	                                }|d         }t1          |j                  D ]\  }}||dz            ||d	z            }}|j        r|j        |         d         nd }|j        r|j        |         d
         nd }|                    ||         ||         |||           |                                                                sz|                                                                sT|                    t<                    }|                    t<                    }|                    ||         ||d|           |                                  |!                    |j"        p|           |#                    |j$                   |%                    |j&        rdnd           |'                    |j(        rdnd           |r|	)                                 |r6|	*                    |j+        ,                    ||j         d                     |||j        z            }|rA|j-        d
         dk    r0|j.        /                                \  }}||         ||         z
  |d<   |r8ta          |j        dz              ta          |1                                           |r=|2                    |j+        ,                    ||j         d          d| dd           |S )Nr   c                     g | ]}| d S )-minr   r   s     r   r   zMark._run.<locals>.<listcomp>$      666A666r   c                     g | ]}| d S )-maxr   r   s     r   r   zMark._run.<locals>.<listcomp>%  r   r   )columnsc                     g | ]}S r   r   )r   rS   r   s     r   r   zMark._run.<locals>.<listcomp>+  s    (((1Q(((r   z	Expected z values, got r   r   r   )labelcolorlsg333333?)alphar   loglinearz.png   Diff:z.csvz%.fF)float_formatindexr   )3osmatplotlib.pyplotpyplotpandasr   listr   	DataFramer   rx   tupler.   
ValueErrordictrk   r   rN   r   r   	TypeErrorlocr   figuresubplot	enumerater   plotisnullr(   astyper,   fill_betweenlegend
set_xlabelr   
set_ylabelr   
set_xscaler   
set_yscaler   showsavefigpathr   shaper   r-   print	to_stringto_csv)r   r   r   r   r   diff_colsave_precisionkwragsr   pltpdy_meany_miny_maxr   dfx_argsrow_meanrow_minrow_maxr   r#   axfirst_xr`   colstycol0col1r   s                                @r   _runz	Mark._run  s   			''''''!66U%566666U%5666u}%%\\'F"2U":U"B\CC 	E 	EAa$// )(((((((1vvW%% !KS\\!K!K!K!KLLL#gq//**F)+RwgH_ # #dgVVV5>1*=VVvVV;+.(FE55  ; ; ;+.d5EFFF;VH$E7"E7""1gg07:WDBF3r77OO? 	OJJLLLBajG!%"233 V V1!!f*~r!f*~u,1LBel1oa((d,1LBel1oa((d7RU!33GGG||~~))++ VELLNN4F4F4H4H V!LL//E!LL//EOOBwKTQTOUUUIIKKKMM%,1'222MM%,'''MM5;<%%H===MM5;<%%H=== 


 OBGLLu4L4L4LMMNNN%**+ 	-q((**,,JD$DBtH,BvJ 	"%/C'(((",,..!!! 	#IIbgll9.F.F.FGGVl[iVlVlVl!  # # #	s   .D55EErp   c           	         t          | j        t                    }|r| j        gn| j        }g }|rYt          j        |d           t          t          j                            |d          d          }	|	                    d           |D ]F}
|	                     | j
        |
|||fi |           |r|	                    d|
j         d           G|r)|	                    d           |	                                 |r|r|d	         S |S d S )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )rx   r   r   r   makedirsopenr   r   writeappendr   r   close)r   r   r   r   	return_dfkwargshas_single_benchr   
result_dfshtmlr   s              r   runzMark.runb  s5   %doyAA*:Odo&&

 	)K	D1111Y??EEDJJ'((( 	H 	HEidiy*j[[TZ[[\\\ H

F5?FFFGGG 	JJ)***JJLLL 	" "!!}$!!tr   N)Fr   )FFrp   F)	r   r   r   r   r   r   r   r   r   r   r   r   r   r     s        % % % chC C) C C CSW C C C CJ     r   r   c                       fd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                 $    t          |           S r   )r   r   s    r   <lambda>zperf_report.<locals>.<lambda>  s    b*-- r   r   )r   wrappers   ` r   perf_reportr  z  s     .---GNr   c                     ddl }ddlm} | s|j                                        } |j        j                            |           d         }|j        j                            |           d         }||z  dz  dz  d	z  }|S )
z return DRAM bandwidth in GB/s r   Nr   rf   mem_clock_ratemem_bus_widthr   g    .A   )r)   r	   rf   r@   current_devicerg   utilsget_device_properties)devicer)   rf   mem_clock_khz	bus_widthbw_gbpss         r   get_dram_gbpsr    s    LLL -**,,M'==fEEFVWM#99&AA/RIi'!+c1A5GNr   c                    dd l }ddlm} |s|j                                        }|j        j                            |          d         dz  }|j                            |          }|d         dk     r| |j	        k    sJ d}ni| |j
        |j        fv rd}nV| |j	        |j        |j        fv rd}n=| |j        t          j        t          j        t          j        fv rd	}nt'          d
          ||z  |z  dz  }|S )Nr   r   r  multiprocessor_count   r     i   i   dtype not supported&.>)r)   r	   rf   r@   r  rg   r  r	  get_device_capabilityfloat16float32int32r{   int16int8tl
float8e4nvfloat8e4b15float8e5RuntimeError	r'   
clock_rater
  r)   rf   num_subcores
capabilityops_per_sub_coretflopss	            r   get_max_tensorcore_tflopsr&    s(   LLL -**,,=&<<VDDE[\_``L11&99J!}q%%%%U]EK000"u}enekBBB"uz2="."+NNN#4555J&)99D@FMr   c                        fd}|S )Nc                 J     t          j                    fd            }|S )Nc                  p   dd l }|                    t          j                                                              }
                                |                                k    }|r|dk    rt          j                            j        d                   }t          j	        d         dd}d|v s
J d            |d         j
        j        j        }| d	j         d
| d}t          j        ddd|gd|          }	|	j        dk    s
J d            dt#          |	j                  v sJ d S  | i | d S )Nr   zcuda-memcheck__file__PATH1)r+  PYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]pytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodecallspecidr   r   r   
returncoder   r   )r   r   r4  	ppid_namerun_cuda_memcheckr   r3  test_idr!   r"   target_kwargstest_fns             r   r   z1cuda_memcheck.<locals>.decorator.<locals>.wrapper  sW   MMMrz||4499;;I - 3 3 5 5 G  )Y/%A%Aw''(;J(GHH!z&1UXYY F***,n*** +09<>>!1>>G>>> nox%L]agjkkk~***,e***0C
OOCCCCCC((((((r   )	functoolswraps)rD  r   rC  s   ` r   	decoratorz cuda_memcheck.<locals>.decorator  s>    		!	!	) 	) 	) 	) 	) 
"	!	)" r   r   )rC  rG  s   ` r   cuda_memcheckrH    s$        , r   F    c           	   #     K   	 t          j        g d           t          j        dddd|  d|  g           t          j        dddd| d| g           t          dg          d	         }t          d
g          d	         }t          || z
            dk     sJ d|  d            t          ||z
            dk     sJ d| d            d| z  }d|z  dz  }||fV  t          j        g d           t          j        g d           t          j        g d           d S # t          j        g d           t          j        g d           t          j        g d           w xY w)N)r   r   r   -pmr,  r   r   r   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryr?   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r   rL  r   )r   r   r   z-rgc)r   r   r   z-rmc)r   r   r$   abs)ref_sm_clockref_mem_clockcur_sm_clockcur_mem_clockr%  gbpss         r   set_gpu_clockrS    s     C E E EFFF>>>>>	!
 	 	 	 	CMCCMCC	!
 	 	 	 1233A66788;<,.//"4446_\6_6_6_444==011B6668b}8b8b8b666)L8&-dl E E EFFF A A ABBB A A ABBBBB 	 E E EFFF A A ABBB A A ABBBBs   CD! !AE%c                    dd l }ddlm} |s|j                                        }|j        j                            |          d         dz  }|j                                        }|d         dk     r+| |j	        k    rd}nM| |j
        k    rd}n?t          d	          | |j	        k    rd}n"| |j
        |j        fv rd}nt          d	          ||z  |z  d
z  }|S )Nr   r   r  r  r  r      @   r  r  )r)   r	   rf   r@   r  rg   r  r	  r  r  r  r  r{   r   s	            r   get_max_simd_tflopsrW    s   LLL -**,,=&<<VDDE[\_``L1133J!}qEM!!!em##!4555EM!!!u}en555!4555J&)99D@FMr   )r5   NNr6   )rY   rZ   NNr6   )NNrp   r   )rI  rJ  )rE  r   r   r   
contextlibr   typingr   r   r   rp   r   r  r	   r$   r4   rX   ro   r   r   r   r  r  r&  rH  rS  rW  r   r   r   <module>rZ     s       				     



 % % % % % % " " " " " " " " " "              	5 	5 	5<P <P <P <P~?@ ?@ ?@ ?@D0^ 0^ 0^ 0^f@ @ @ @ @ @ @ @F` ` ` ` ` ` ` `F  
 
 
 
   :  6 C C C C8     r   